chore(shortint): add ks32 & prod params to tests

Make KS32 params the default and add them to the tests. Add a `PARAM_PROD` parameter alias that still refers to the KS-PBS parameters, so that those remain tested. Prod params are tested in a separate workflow to avoid increasing the overall test time. BREAKING CHANGE: `PARAM_MESSAGE_2_CARRY_2` now returns `KeySwitch32PBSParameters` instead of `ClassicPBSParameters`.
chore: remove integer test filter which is not relevant anymore
2026-04-28 03:01:21 -04:00 · 2026-02-09 11:43:42 +01:00 · 2026-02-09 11:42:58 +01:00
409 changed files with 19306 additions and 47923 deletions
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -68,12 +68,6 @@ runs:
        echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
        sha256sum -c checksum
        sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
-
-        # Disable unattended-upgrades to avoid lock issues
-        sudo systemctl disable --now unattended-upgrades
-
-        sudo apt-get clean
-        sudo rm -rf /var/lib/apt/lists/*
        sudo apt update
        sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"

--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -14,7 +14,9 @@ env:
  SLACKIFY_MARKDOWN: true
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -30,11 +32,38 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  setup-instance:
+    name: aws_tfhe_backward_compat_tests/setup-instance
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name != 'push'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
  backward-compat-tests:
    name: aws_tfhe_backward_compat_tests/backward-compat-tests (bpr)
-    if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name != 'push'
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
+    needs: [ setup-instance ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    concurrency:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -54,7 +83,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -83,7 +112,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -105,3 +134,27 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_backward_compat_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, backward-compat-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -15,7 +15,9 @@ env:
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -132,15 +134,41 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

-  fast-tests:
-    name: Fast CPU tests
-    needs: should-run
+  setup-instance:
+    name: aws_tfhe_fast_tests/setup-instance
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  fast-tests:
+    name: Fast CPU tests
+    needs: [ should-run, setup-instance ]
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -191,7 +219,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            ~/.nvm
@@ -204,7 +232,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -261,3 +289,27 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_fast_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, fast-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -17,7 +17,9 @@ env:
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -69,18 +71,44 @@ jobs:
              - tfhe/src/integer/**
              - .github/workflows/aws_tfhe_integer_tests.yml

-  unsigned-integer-tests:
-    name: aws_tfhe_integer_tests/unsigned-integer-tests
+  setup-instance:
+    name: aws_tfhe_integer_tests/setup-instance
    needs: should-run
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
      github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  unsigned-integer-tests:
+    name: aws_tfhe_integer_tests/unsigned-integer-tests
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
@@ -130,3 +158,27 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_integer_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [setup-instance, unsigned-integer-tests]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -13,7 +13,8 @@ env:
  SLACKIFY_MARKDOWN: true
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -34,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +100,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_param_prod_tests.yml
+++ b/.github/workflows/aws_tfhe_param_prod_tests.yml
@@ -1,5 +1,5 @@
-# Compile and test zk-cuda-backend
-name: gpu_zk_tests
+# Run the shortint tests with the production parameter set.
+name: aws_tfhe_param_prod_tests

 env:
  CARGO_TERM_COLOR: always
@@ -17,12 +17,13 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
  # Secrets will be available only to zama-ai organization members
  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+    types: [ labeled ]

 permissions:
  contents: read
@@ -31,15 +32,22 @@ permissions:

 jobs:
  should-run:
-    name: gpu_zk_tests/should-run
+    name: aws_tfhe_param_prod_tests/should-run
+    if: (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read  # Needed to check for file change
+      pull-requests: read # Needed to check for file change
    outputs:
-      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -50,21 +58,34 @@ jobs:
        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
-            gpu:
+            dependencies:
              - tfhe/Cargo.toml
-              - tfhe/build.rs
-              - backends/zk-cuda-backend/**
-              - tfhe/src/integer/gpu/zk/**
+              - tfhe-csprng/**
+              - tfhe-fft/**
              - tfhe-zk-pok/**
-              - 'tfhe/docs/**/**.md'
-              - '.github/workflows/gpu_zk_tests.yml'
-              - ci/slab.toml
-
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            versionable:
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+      - name: Aggregate file changes
+        id: aggregated-changes
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' )
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
  setup-instance:
-    name: gpu_zk_tests/setup-instance
-    needs: should-run
+    name: aws_tfhe_param_prod_tests/setup-instance
    if: github.event_name == 'workflow_dispatch' ||
-      needs.should-run.outputs.gpu_test == 'true'
+      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -72,14 +93,14 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: gpu-test
+          backend: aws
+          profile: cpu-big

      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
@@ -88,85 +109,60 @@ jobs:
        run: |
          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

-  cuda-tests-linux:
-    name: gpu_zk_tests/cuda-tests-linux
+  param-prod-tests:
+    name: aws_tfhe_param_prod_tests/param-prod-tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request' ||
-      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow_ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.8"
-            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
-
      - name: Install latest stable
        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

-      - name: Enable nvidia multi-process service
+      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
-          nvidia-cuda-mps-control -d
+          make gen_key_cache

-      - name: Run zk-cuda-backend integration tests
+      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
-          make test_zk_cuda_backend
-          make test_zk_pok_gpu
-          make test_integer_zk_gpu
-          make test_integer_zk_experimental_gpu
-
-  slack-notify:
-    name: gpu_zk_tests/slack-notify
-    needs: [ setup-instance, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
+          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=FALSE make test_param_prod_shortint_ci
      - name: Set pull-request URL
-        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
+        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
          PR_NUMBER: ${{ github.event.pull_request.number }}

-      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Slack Notification
+        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
+        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
-          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "ZK GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

  teardown-instance:
-    name: gpu_zk_tests/teardown-instance
+    name: aws_tfhe_param_prod_tests/teardown-instance
    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, cuda-tests-linux ]
+    needs: [ setup-instance, param-prod-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,8 +171,8 @@ jobs:
          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (param-prod-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -17,7 +17,9 @@ env:
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -70,18 +72,44 @@ jobs:
              - tfhe/src/integer/**
              - .github/workflows/aws_tfhe_signed_integer_tests.yml

-  signed-integer-tests:
-    name: aws_tfhe_signed_integer_tests/signed-integer-tests
+  setup-instance:
+    name: aws_tfhe_signed_integer_tests/setup-instance
    needs: should-run
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
      github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  signed-integer-tests:
+    name: aws_tfhe_signed_integer_tests/signed-integer-tests
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -134,3 +162,27 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_signed_integer_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [setup-instance, signed-integer-tests]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -14,7 +14,9 @@ env:
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -141,15 +143,43 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

-  cpu-tests:
-    name: aws_tfhe_tests/cpu-tests
-    needs: should-run
+  setup-instance:
+    name: aws_tfhe_tests/setup-instance
    if: github.event_name != 'pull_request' ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cpu-tests:
+    name: aws_tfhe_tests/cpu-tests
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    needs: [ should-run, setup-instance ]
    concurrency:
      group: ${{ github.workflow_ref }}_${{github.event_name}}
      cancel-in-progress: true
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -239,3 +269,27 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, cpu-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -13,7 +13,9 @@ env:
  SLACKIFY_MARKDOWN: true
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,53 +29,39 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  should-run:
-    name: aws_tfhe_wasm_tests/should-run
-    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
+  setup-instance:
+    name: aws_tfhe_wasm_tests/setup-instance
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
    outputs:
-      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
-          steps.changed-files.outputs.wasm_any_changed }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small

-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
-        with:
-          files_yaml: |
-            wasm:
-                - Cargo.toml
-                - tfhe/Cargo.toml
-                - tfhe-csprng/**
-                - tfhe-fft/**
-                - tfhe-zk-pok/**
-                - tfhe/src/core_crypto/**
-                - tfhe/src/shortint/**
-                - tfhe/src/integer/**
-                - tfhe/src/high_level_api/**
-                - tfhe/src/js_on_wasm_api/**
-                - tfhe/js_on_wasm_tests/**
-                - tfhe/web_wasm_parallel_tests/**
-                - utils/tfhe-versionable/**
-                - .github/workflows/aws_tfhe_wasm_tests.yml
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  wasm-tests:
    name: aws_tfhe_wasm_tests/wasm-tests
-    needs: should-run
-    if: github.event_name == 'workflow_dispatch' ||
-      (contains(github.event.label.name, 'approved') && needs.should-run.outputs.wasm_test == 'true')
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}_${{github.event_name}}
      cancel-in-progress: true
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -92,7 +80,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            ~/.nvm
@@ -105,7 +93,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -149,3 +137,27 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_wasm_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, wasm-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -14,7 +14,6 @@ on:
          - signed_integer
          - integer_compression
          - integer_zk
-          - msm_zk
          - shortint
          - shortint_oprf
          - hlapi_unsigned
@@ -22,7 +21,6 @@ on:
          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
-          - hlapi_kvstore
          - tfhe_zk_pok
          - boolean
          - pbs
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -36,7 +36,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -50,40 +50,6 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-cpu-zk-server:
-    name: benchmark_documentation/run-benchmarks-cpu-zk-server
-    uses: ./.github/workflows/benchmark_cpu_common.yml
-    if: inputs.run-cpu-benchmarks
-    with:
-      command: integer_zk
-      op_flavor: default
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-cpu-zk-client:
-    name: benchmark_documentation/run-benchmarks-cpu-zk-client
-    uses: ./.github/workflows/benchmark_wasm_client_common.yml
-    if: inputs.run-cpu-benchmarks
-    with:
-      browser: chrome
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
  run-benchmarks-gpu-integer:
    name: benchmark_documentation/run-benchmarks-gpu-integer
    uses: ./.github/workflows/benchmark_gpu_common.yml
@@ -91,7 +57,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc20
+      command: integer_multi_bit
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -110,7 +76,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -172,7 +138,6 @@ jobs:
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
-      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
    ]
    uses: ./.github/workflows/generate_svgs.yml
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,8 +31,6 @@ on:
          - pbs128
          - ks
          - ks_pbs
-          - tfhe_zk_pok
-          - msm_zk
          - integer_zk
          - integer_aes
          - integer_aes256
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -94,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -197,7 +197,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
        with:
          path: |
            ~/.cargo/registry
@@ -207,14 +207,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0
+        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0
+        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -326,7 +326,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -1,28 +1,279 @@
-# Run CUDA benchmarks on Hyperstack VM and return parsed results to Slab CI bot.
+# Run CUDA benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
 name: benchmark_gpu_weekly

-run-name: GPU weekly benchmarks
-
 on:
  schedule:
+    # Weekly schedules are split into several groups to avoid spawning too many machines at once, which would risk resource shortages.
+    # Group 1
+    # -------
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
    - cron: '0 1 * * 6'
+    # Group 2
+    # -------
+    # Weekly benchmarks will be triggered each Sunday at 1a.m.
+    - cron: '0 1 * * 0'
+    # Group 3
+    # -------
+    # Weekly benchmarks will be triggered each Sunday at 9a.m.
+    - cron: '0 9 * * 0'
+

 permissions: {}

 # zizmor: ignore[concurrency-limits] only GitHub can trigger this workflow

 jobs:
-  run-benchmarks-8-h100-sxm5-summary:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-summary
+  prepare-inputs:  # Turns the cron expression that fired into one boolean output per weekly bench group
+    name: benchmark_gpu_weekly/prepare-inputs
    if: github.repository == 'zama-ai/tfhe-rs'
+    runs-on: ubuntu-latest
+    outputs:  # each literal compared in the steps below must stay identical to a cron entry under on.schedule
+      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
+      is_weekly_bench_group_2: ${{ steps.check_bench_group_2.outputs.is_weekly_bench_group_2 }}
+      is_weekly_bench_group_3: ${{ steps.check_bench_group_3.outputs.is_weekly_bench_group_3 }}
+    steps:
+      - name: Check is weekly bench group 1
+        id: check_bench_group_1
+        run: | # zizmor: ignore[template-injection] this env variable is safe
+          echo "is_weekly_bench_group_1=${{ github.event.schedule == '0 1 * * 6' }}" >> "${GITHUB_OUTPUT}"
+
+      - name: Check is weekly bench group 2
+        id: check_bench_group_2
+        run: | # zizmor: ignore[template-injection] this env variable is safe
+          echo "is_weekly_bench_group_2=${{ github.event.schedule == '0 1 * * 0' }}" >> "${GITHUB_OUTPUT}"
+
+      - name: Check is weekly bench group 3
+        id: check_bench_group_3
+        run: | # zizmor: ignore[template-injection] this env variable is safe
+          echo "is_weekly_bench_group_3=${{ github.event.schedule == '0 9 * * 0' }}" >> "${GITHUB_OUTPUT}"
+
+
+  run-benchmarks-8-h100-sxm5-integer:
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: summary
+      command: integer_multi_bit
+      op_flavor: default
+      bench_type: both
+      precisions_set: fast
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100-sxm5-integer-compression:
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: integer_compression
+      op_flavor: default
+      bench_type: both
+      precisions_set: fast
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100-sxm5-integer-zk-aes:
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk-aes
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: integer_zk,integer_aes,integer_aes256
+      op_flavor: default
+      bench_type: both
+      precisions_set: fast
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100-sxm5-noise-squash:
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: hlapi_noise_squash
+      op_flavor: default
+      bench_type: both
+      precisions_set: fast
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-1-h100-core-crypto:
+    name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: pbs,pbs128,ks,ks_pbs
+      bench_type: latency
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  # -----------------------------------------------------
+  # ERC20 benchmarks
+  # -----------------------------------------------------
+
+  run-benchmarks-1-h100-erc20:
+    name: benchmark_gpu_weekly/run-benchmarks-1-h100-erc20
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: hlapi_erc20
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-2-h100-erc20:
+    name: benchmark_gpu_weekly/run-benchmarks-2-h100-erc20
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
+      command: hlapi_erc20
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100-erc20:  # Group 2: runs the hlapi_erc20 benchmarks through the shared GPU benchmark workflow
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-erc20
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: multi-h100
+      hardware_name: n3-H100-SXM5x8  # NOTE(review): other jobs in this file pair this name with profile multi-h100-sxm5 — confirm it matches multi-h100
+      command: hlapi_erc20
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  # -----------------------------------------------------
+  # DEX benchmarks
+  # -----------------------------------------------------
+
+  run-benchmarks-1-h100-dex:
+    name: benchmark_gpu_weekly/run-benchmarks-1-h100-dex
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: hlapi_dex
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-2-h100-dex:
+    name: benchmark_gpu_weekly/run-benchmarks-2-h100-dex
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
+      command: hlapi_dex
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100-dex:  # Group 2: runs the hlapi_dex benchmarks through the shared GPU benchmark workflow
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-dex
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    needs: prepare-inputs
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: multi-h100
+      hardware_name: n3-H100-SXM5x8  # NOTE(review): other jobs in this file pair this name with profile multi-h100-sxm5 — confirm it matches multi-h100
+      command: hlapi_dex
      bench_type: both
-      params_type: classical + multi_bit
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_summary.yml
+++ b/.github/workflows/benchmark_summary.yml
@@ -1,136 +0,0 @@
-# Run all benchmarks displayed in the internal documentation.
-name: benchmark_summary
-
-run-name: Benchmark Summary
-
-on:
-  workflow_dispatch:
-    inputs:
-      run-cpu-benchmarks:
-        description: "Run CPU benchmarks"
-        type: boolean
-        default: true
-      run-gpu-benchmarks:
-        description: "Run GPU benchmarks"
-        type: boolean
-        default: true
-      gpu-profile:
-        description: "GPU Instance type"
-        required: true
-        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
-        type: choice
-        options:
-          - "l40 (n3-L40x1)"
-          - "4-l40 (n3-L40x4)"
-          - "8-l40 (n3-L40x8)"
-          - "multi-a100-nvlink (n3-A100x8-NVLink)"
-          - "single-h100 (n3-H100x1)"
-          - "2-h100 (n3-H100x2)"
-          - "4-h100 (n3-H100x4)"
-          - "multi-h100 (n3-H100x8)"
-          - "multi-h100-nvlink (n3-H100x8-NVLink)"
-          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
-      bench_type:
-        description: "Benchmarks type"
-        type: choice
-        default: both
-        options:
-          - latency
-          - throughput
-          - both
-      run-hpu-benchmarks:
-        description: "Run HPU benchmarks"
-        type: boolean
-        default: true
-
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-
-jobs:
-  parse-gpu-inputs:
-    name: benchmark_summary/parse-gpu-inputs
-    if: inputs.run-gpu-benchmarks
-    runs-on: ubuntu-latest
-    outputs:
-      profile: ${{ steps.parse_profile.outputs.profile }}
-      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
-    env:
-      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
-    steps:
-      - name: Parse profile
-        id: parse_profile
-        run: |
-          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
-          # shellcheck disable=SC2001
-          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
-          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
-
-      - name: Parse hardware name
-        id: parse_hardware_name
-        run: |
-          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
-          # shellcheck disable=SC2001
-          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
-          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
-
-  run-benchmarks-cpu:
-    name: benchmark_documentation/run-benchmarks-cpu-integer
-    uses: ./.github/workflows/benchmark_cpu_common.yml
-    if: inputs.run-cpu-benchmarks
-    with:
-      command: summary
-      bench_type: ${{ inputs.bench_type }}
-      params_type: classical + multi_bit
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-gpu:
-    name: benchmark_documentation/run-benchmarks-gpu
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    if: inputs.run-gpu-benchmarks
-    needs: parse-gpu-inputs
-    with:
-      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
-      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
-      command: summary
-      bench_type: ${{ inputs.bench_type }}
-      params_type: classical + multi_bit
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-# TODO add make recipe for HPU benchmarks
-#  run-benchmarks-hpu:
-#    name: benchmark_documentation/run-benchmarks-hpu
-#    uses: ./.github/workflows/benchmark_hpu_common.yml
-#    if: inputs.run-hpu-benchmarks
-#    with:
-#      command: summary
-#      bench_type: ${{ inputs.bench_type }}
-#      v80_pcie_dev: 24
-#      v80_serial_number: XFL12NWY3ZKG
-#    secrets:
-#      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-#      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-#      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-#      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-#      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-#      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-#      SLAB_URL: ${{ secrets.SLAB_URL }}
-#      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-#      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -58,19 +58,171 @@ jobs:
              - tfhe/web_wasm_parallel_tests/**
              - .github/workflows/wasm_client_benchmark.yml

-  run-benchmarks-cpu-zk-client:
-    name: benchmark_documentation/run-benchmarks-cpu-zk-client
-    uses: ./.github/workflows/benchmark_wasm_client_common.yml
-    needs: should-run
+  setup-instance: # Job: starts the instance that wasm-client-benchmarks runs on
+    name: benchmark_wasm_client/setup-instance
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+    needs: should-run # The job-level condition reads needs.should-run.outputs.wasm_bench
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }} # `label` output of the Start instance step; used as runs-on by wasm-client-benchmarks and passed back to the stop step in teardown-instance
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start # The matching `mode: stop` call lives in teardown-instance
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  wasm-client-benchmarks: # Job: runs the WASM client benchmarks for each browser on the instance started by setup-instance, then sends the parsed results to Slab
+    name: benchmark_wasm_client/wasm-client-benchmarks
+    needs: setup-instance
+    if: needs.setup-instance.result != 'skipped' # Explicit guard: do not run when setup-instance was skipped
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      max-parallel: 1 # Matrix entries run one at a time
+      matrix:
+        browser: [ chrome, firefox ] # Value is substituted into the make target names below via BROWSER
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0 # Full history; the next step calls `git describe --tags`
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Get benchmark details # Exports BENCH_DATE, COMMIT_DATE and COMMIT_HASH through GITHUB_ENV for the Parse results step
+        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: nightly
+
+      - name: Get Node version # Exports NODE_VERSION, which is part of the cache key below
+        run: |
+          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
+
+      - name: Node cache restoration
+        id: node-cache
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}
+
+      - name: Install Node
+        if: steps.node-cache.outputs.cache-hit != 'true' # Skipped when the restore step reported a cache hit
+        run: |
+          make install_node
+
+      - name: Node cache save
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        if: steps.node-cache.outputs.cache-hit != 'true' # Only save when the restore step did not report a hit
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}
+
+      - name: Install web resources # Installs the browser and its web driver for the current matrix entry
+        run: |
+          make install_"${BROWSER}"_browser
+          make install_"${BROWSER}"_web_driver
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Run benchmarks
+        run: |
+          make bench_web_js_api_parallel_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Run benchmarks (unsafe coop)
+        run: |
+          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Parse results # Reads BENCH_DATE, COMMIT_DATE and COMMIT_HASH exported by the Get benchmark details step
+        run: |
+          make parse_wasm_benchmarks
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
+          --database tfhe_rs \
+          --hardware "m6i.4xlarge" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --key-gen
+          rm tfhe-benchmark/wasm_pk_gen.csv
+        env:
+          REF_NAME: ${{ github.ref_name }}
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        with:
+          name: ${{ github.sha }}_wasm_${{ matrix.browser }} # Browser suffix keeps the artifact name distinct per matrix entry
+          path: ${{ env.RESULTS_FILENAME }} # NOTE(review): RESULTS_FILENAME is not set in this job; presumably defined in the workflow-level env - confirm
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          repository: zama-ai/slab
+          path: slab # The Send data to Slab step runs slab/scripts/data_sender.py from this checkout
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }} # On failure, or on cancellation for events other than pull_request
+        continue-on-error: true # A failing notification must not fail the job
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance: # Job: stops the instance started by setup-instance
+    name: benchmark_wasm_client/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }} # always() keeps this job eligible when the benchmark job failed or was cancelled; it still requires a successful setup-instance
+    needs: [ setup-instance, wasm-client-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }} # Label produced by the Start instance step of setup-instance
+
+      - name: Slack Notification
+        if: ${{ failure() }} # Only when the stop step above failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -1,234 +0,0 @@
-# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
-name: benchmark_wasm_client_common
-
-on:
-  workflow_call:
-    inputs:
-      browser:
-        type: string # Use comma separated values to generate an array
-        default: chrome,firefox
-    secrets:
-      REPO_CHECKOUT_TOKEN:
-        required: true
-      SLAB_ACTION_TOKEN:
-        required: true
-      SLAB_BASE_URL:
-        required: true
-      SLAB_URL:
-        required: true
-      JOB_SECRET:
-        required: true
-      SLACK_CHANNEL:
-        required: true
-      BOT_USERNAME:
-        required: true
-      SLACK_WEBHOOK:
-        required: true
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members and GitHub can trigger this workflow
-
-jobs:
-  prepare-matrix:
-    name: benchmark_wasm_client_common/prepare-matrix
-    runs-on: ubuntu-latest
-    outputs:
-      browser: ${{ steps.set_matrix_arg.outputs.browser }}
-    steps:
-      - name: Parse user inputs
-        shell: python
-        env:
-          INPUTS_BROWSER: ${{ inputs.browser }}
-        run: |
-          import os
-
-          inputs_browser = os.environ["INPUTS_BROWSER"]
-          env_file = os.environ["GITHUB_ENV"]
-
-          split_browser = inputs_browser.replace(" ", "").split(",")
-
-          with open(env_file, "a") as f:
-            f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
-
-      - name: Set martix arguments output
-        id: set_matrix_arg
-        run: | # zizmor: ignore[template-injection] this env variable is safe
-          echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
-
-  setup-instance:
-    name: benchmark_wasm_client_common/setup-instance
-    needs: prepare-matrix
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-  wasm-client-benchmarks:
-    name: benchmark_wasm_client_common/wasm-client-benchmarks
-    needs: [ prepare-matrix, setup-instance ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      max-parallel: 1
-      matrix:
-        browser: ${{ fromJSON(needs.prepare-matrix.outputs.browser) }}
-    steps:
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=${COMMIT_DATE}";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-        env:
-          SHA: ${{ github.sha }}
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: nightly
-
-      - name: Get Node version
-        run: |
-          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
-
-      - name: Node cache restoration
-        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install Node
-        if: steps.node-cache.outputs.cache-hit != 'true'
-        run: |
-          make install_node
-
-      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        if: steps.node-cache.outputs.cache-hit != 'true'
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install web resources
-        run: |
-          make install_"${BROWSER}"_browser
-          make install_"${BROWSER}"_web_driver
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Run benchmarks
-        run: |
-          make bench_web_js_api_parallel_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Run benchmarks (unsafe coop)
-        run: |
-          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Parse results
-        run: |
-          make parse_wasm_benchmarks
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
-          --database tfhe_rs \
-          --hardware "m6i.4xlarge" \
-          --project-version "${COMMIT_HASH}" \
-          --branch "${REF_NAME}" \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${BENCH_DATE}" \
-          --key-gen
-          rm tfhe-benchmark/wasm_pk_gen.csv
-        env:
-          REF_NAME: ${{ github.ref_name }}
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
-        with:
-          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          repository: zama-ai/slab
-          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
-          --slab-url "${SLAB_URL}"
-        env:
-          JOB_SECRET: ${{ secrets.JOB_SECRET }}
-          SLAB_URL: ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: benchmark_wasm_client_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, wasm-client-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -57,7 +57,9 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  SLACKIFY_MARKDOWN: true
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
  LINELINT_VERSION: 0.0.6
  LINELINT_CHECKSUM: "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b"

@@ -67,10 +69,37 @@ permissions:
 # zizmor: ignore[concurrency-limits] caller workflow is responsible for the concurrency

 jobs:
-  prepare-matrix:
-    name: cargo_build_common/prepare-matrix
+  setup-instance: # Job: selects the runner label - a Slab-started instance when secrets are available, otherwise EXTERNAL_CONTRIBUTION_RUNNER
+    name: cargo_build_common/setup-instance
    if: inputs.run-pcc-cpu-batch || inputs.run-pcc-hpu || inputs.run-build || inputs.run-build-layers || inputs.run-build-tfhe-full || inputs.run-build-c-api
    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }} # The two steps have opposite conditions, so whichever one ran supplies the value
+      run_attempt: ${{ github.run_attempt }} # Compared by teardown-instance against the current attempt. On a re-run where this job already succeeded, it is not executed again and this value is not incremented
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true' # SECRETS_AVAILABLE is derived from secrets.JOB_SECRET being non-empty
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+      # This instance will be spawned specifically for pull requests from forked repositories
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false' # No secrets: only publish the runner label, nothing is started through Slab
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  prepare-matrix:
+    name: cargo_build_common/prepare-matrix
+    runs-on: ubuntu-latest
+    needs: setup-instance
    outputs:
      runners: ${{ steps.set_matrix_runners.outputs.runners }}
    steps:
@@ -78,12 +107,12 @@ jobs:
        shell: python
        env:
          INPUTS_EXTRA_RUNNERS_TO_USE: ${{ inputs.extra-runners-to-use }}
-          REMOTE_RUNNER: "runs-on=${{ github.run_id }}/runner=cpu-small"
+          REMOTE_RUNNER_LABEL: ${{ needs.setup-instance.outputs.runner-name }}
        run: |
          import os
          
          inputs_extra_runners = os.environ["INPUTS_EXTRA_RUNNERS_TO_USE"]
-          remote_runner_label = os.environ["REMOTE_RUNNER"]
+          remote_runner_label = os.environ["REMOTE_RUNNER_LABEL"]
          env_file = os.environ["GITHUB_ENV"]
          
          runners = [remote_runner_label, ]
@@ -101,7 +130,7 @@ jobs:

  builds:
    name: cargo_build_common/builds
-    needs: prepare-matrix
+    needs: [ setup-instance, prepare-matrix ]
    runs-on: ${{ matrix.runner }}
    strategy:
      matrix:
@@ -130,35 +159,6 @@ jobs:
          chmod +x linelint-linux-amd64
          ln -s "$(pwd)/linelint-linux-amd64" /usr/local/bin/linelint

-      - name: Get Node version
-        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
-        run: |
-          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
-
-      - name: Node cache restoration
-        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
-        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install Node
-        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
-        run: |
-          make install_node
-
-      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
      - name: Run pcc checks batch
        if: inputs.run-pcc-cpu-batch
        run: |
@@ -230,3 +230,29 @@ jobs:
        if: ${{ always() }}
        run: | # zizmor: ignore[template-injection] this context variable is safe
          echo "result=${{ job.status }}" >> "${GITHUB_OUTPUT}"
+
+  teardown-instance: # Job: stops the Slab instance started by setup-instance once the builds job has finished
+    name: cargo_build_common/teardown-instance
+    if: ${{ always() &&
+      needs.setup-instance.result == 'success' &&
+      github.run_attempt == needs.setup-instance.outputs.run_attempt }} # Only run if setup-instance has been executed during this run attempt
+    needs: [setup-instance, builds]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true' # Without secrets setup-instance only published a runner label, so there is no Slab instance to stop
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }} # Label produced by setup-instance
+
+      - name: Slack Notification
+        if: ${{ failure() }} # Only when the stop step above failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cargo-builds) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -18,7 +18,7 @@ jobs:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
        with:
-          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc|perf)(\([\w\-_]+\))?\!?\:) .+$'
+          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
          flags: "gs"
          error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
          excludeDescription: "true" # optional: this excludes the description body of a pull request
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -43,14 +43,14 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@0dce2577a4760a2749d8cfb7a84b7d5585ebcb7d # v0.5.0
+        uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@d5d20e15f2736816ee0e001ba8b24b54d9ffcff4 # v5.0.0
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6124774845927d14c601359ab8138699fa5b70c3 # v4.0.1
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -23,12 +23,30 @@ permissions:
 # zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow

 jobs:
+  setup-instance: # Job: starts the instance that code-coverage-tests runs on
+    name: code_coverage/setup-instance
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }} # `label` output of the Start instance step; used as runs-on by code-coverage-tests and passed back to the stop step in teardown-instance
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start # The matching `mode: stop` call lives in teardown-instance
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
  code-coverage-tests:
    name: code_coverage/code-coverage-tests
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}_${{ github.event_name }}
      cancel-in-progress: true
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
@@ -103,3 +121,26 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance: # Job: stops the instance started by setup-instance
+    name: code_coverage/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }} # always() keeps this job eligible when code-coverage-tests failed or was cancelled; it still requires a successful setup-instance
+    needs: [ setup-instance, code-coverage-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }} # Label produced by the Start instance step of setup-instance
+
+      - name: Slack Notification
+        if: ${{ failure() }} # Only when the stop step above failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (code-coverage-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -10,10 +10,10 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  SLACKIFY_MARKDOWN: true
-  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,44 +27,39 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  should-run:
-    name: csprng_randomness_tests/should-run
-    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
+  setup-instance: # Job: selects the runner label - a Slab-started instance when secrets are available, otherwise EXTERNAL_CONTRIBUTION_RUNNER
+    name: csprng_randomness_tests/setup-instance
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }} # Manual dispatch, or a label event whose label name contains 'approved'
    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
    outputs:
-      csprng_test: ${{ github.event_name == 'workflow_dispatch' ||
-        steps.changed-files.outputs.csprng_any_changed }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }} # The two steps have opposite conditions, so whichever one ran supplies the value
    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true' # SECRETS_AVAILABLE is derived from secrets.JOB_SECRET being non-empty
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small

-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
-        with:
-          files_yaml: |
-            csprng:
-              - Cargo.toml
-              - tfhe/Cargo.toml
-              - tfhe-csprng/**
-              - utils/tfhe-versionable/**
-              - .github/workflows/csprng_randomness_tests.yml
+      # This instance will be spawned specifically for pull requests from forked repositories
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false' # No secrets: only publish the runner label, nothing is started through Slab
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  csprng-randomness-tests:
    name: csprng_randomness_tests/csprng-randomness-tests
-    needs: should-run
-    if: github.event_name == 'workflow_dispatch' ||
-      (contains(github.event.label.name, 'approved') && needs.should-run.outputs.csprng_test == 'true')
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}_${{ github.sha }}_${{ github.event_name }}
      cancel-in-progress: true
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -81,18 +76,34 @@ jobs:
        run: |
          make dieharder_csprng

-      - name: Set pull-request URL
-        if: ${{ failure() && github.event_name == 'pull_request' }}
-        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
-        env:
-          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: csprng_randomness_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, csprng-randomness-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -9,9 +9,6 @@ on:
        type: string
      layer:
        type: string
-      bench_subset:
-        type: string
-        default: all
      pbs_kind: # Valid values are 'classical', 'multi_bit' or 'any'
        type: string
      grouping_factor: # Valid values are 2, 3, or 4
@@ -19,9 +16,6 @@ on:
        default: 4
      bench_type: # Valid values are 'latency', 'throughput'
        type: string
-      name_suffix:
-        type: string
-        default: _mean_avx512
      backend_comparison:
        type: boolean
        default: false
@@ -66,8 +60,6 @@ jobs:
          --pbs-kind "${PBS_KIND}" \
          --grouping-factor "${GROUPING_FACTOR}" \
          --bench-type "${BENCH_TYPE}" \
-          --bench-subset "${BENCH_SUBSET}" \
-          --name-suffix "${NAME_SUFFIX}" \
          --time-span-days "${TIME_SPAN}"
        env:
          OUTPUT_FILENAME: ${{ inputs.output_filename }}
@@ -78,8 +70,6 @@ jobs:
          PBS_KIND: ${{ inputs.pbs_kind }}
          GROUPING_FACTOR: ${{ inputs.grouping_factor }}
          BENCH_TYPE: ${{ inputs.bench_type }}
-          BENCH_SUBSET: ${{ inputs.bench_subset }}
-          NAME_SUFFIX: ${{ inputs.name_suffix }}
          TIME_SPAN: ${{ inputs.time_span_days }}
          DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
          DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
@@ -89,7 +79,7 @@ jobs:
        if: inputs.backend_comparison == false
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
        with:
-          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
+          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
          # This will upload all the file generated
          path: ${{ inputs.output_filename }}*.svg
          retention-days: 60
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -51,7 +51,7 @@ jobs:
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

  cpu-integer-throughput-table:
-    name: generate_documentation_svgs/cpu-integer-throughput-table
+    name: generate_documentation_svgs/cpu-integer-latency-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
@@ -150,124 +150,6 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  # -----------------------------------------------------------
-  # ZK benchmarks tables
-  # -----------------------------------------------------------
-
-  cpu-zk-server-latency-table:
-    name: generate_documentation_svgs/cpu-zk-server-latency-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: hpc7a.96xlarge
-      layer: integer
-      bench_subset: zk
-      pbs_kind: classical
-      bench_type: latency
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-zk-benchmark-latency
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  cpu-zk-server-throughput-table:
-    name: generate_documentation_svgs/cpu-zk-server-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: hpc7a.96xlarge
-      layer: integer
-      bench_subset: zk
-      pbs_kind: classical
-      bench_type: throughput
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-zk-benchmark-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  cpu-zk-client-latency-table:
-    name: generate_documentation_svgs/cpu-zk-client-latency-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: m6i.4xlarge
-      layer: wasm
-      bench_subset: zk
-      pbs_kind: classical
-      bench_type: latency
-      name_suffix: _chrome_mean
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-zk-wasm-benchmark-latency
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  # -----------------------------------------------------------
-  # ERC20 benchmarks tables
-  # -----------------------------------------------------------
-
-  cpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: hpc7a.96xlarge
-      layer: hlapi
-      bench_subset: erc20
-      pbs_kind: classical
-      bench_type: both
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  gpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-gpu-svgs
-    with:
-      backend: gpu
-      hardware_name: n3-H100-SXM5x8
-      layer: hlapi
-      bench_subset: erc20
-      pbs_kind: multi_bit
-      grouping_factor: 4
-      bench_type: both
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  hpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-hpu-svgs
-    with:
-      backend: hpu
-      hardware_name: hpu_x1
-      layer: hlapi
-      bench_subset: erc20
-      pbs_kind: classical
-      bench_type: both
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
  # -----------------------------------------------------------
  # PBS benchmarks tables
  # -----------------------------------------------------------
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,11 +93,6 @@ jobs:

      - name: Find tools
        run: |
-          # Disable unattended-upgrades to avoid lock issues
-          sudo systemctl disable --now unattended-upgrades
-
-          sudo apt-get clean
-          sudo rm -rf /var/lib/apt/lists/*
          sudo apt update && sudo apt install -y valgrind 
          find /usr -executable -name "compute-sanitizer"
          which valgrind
@@ -111,10 +106,6 @@ jobs:
        run: |
          make test_high_level_api_gpu_valgrind

-      - name: Run CUDA backend racecheck tests
-        run: |
-          make test_cuda_backend_race_check
-
  slack-notify:
    name: gpu_code_validation_tests/slack-notify
    needs: [ setup-instance, cuda-tests-linux ]
@@ -146,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -64,6 +64,8 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
@@ -84,7 +86,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -183,7 +185,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -64,6 +64,8 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_fast_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_fast_tests/setup-instance
@@ -77,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -66,6 +66,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/**_multi_gpu_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_full_multi_gpu_tests/setup-instance
@@ -80,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +187,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -65,6 +65,8 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
@@ -85,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -191,7 +193,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -31,50 +31,18 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  should-run:
-    name: gpu_memory_sanitizer/should-run
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
-    outputs:
-      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
-        with:
-          files_yaml: |
-            gpu:
-              - Cargo.toml
-              - tfhe/Cargo.toml
-              - tfhe/build.rs
-              - backends/tfhe-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
-              - tfhe/src/high_level_api/**
-              - '.github/workflows/gpu_memory_sanitizer.yml'
-
  setup-instance:
    name: gpu_memory_sanitizer/setup-instance
-    needs: should-run
    runs-on: ubuntu-latest
-    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    if: github.event_name != 'pull_request' ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -31,50 +31,18 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  should-run:
-    name: gpu_memory_sanitizer_h100/should-run
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
-    outputs:
-      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
-        with:
-          files_yaml: |
-            gpu:
-              - Cargo.toml
-              - tfhe/Cargo.toml
-              - tfhe/build.rs
-              - backends/tfhe-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
-              - tfhe/src/high_level_api/**
-              - '.github/workflows/gpu_memory_sanitizer_h100.yml'
-
  setup-instance:
    name: gpu_memory_sanitizer/setup-instance
-    needs: should-run
    runs-on: ubuntu-latest
-    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    if: github.event_name != 'pull_request' ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

-      - name: Install CUDA and other dependencies
+      - name: Install CUDA
        if: env.SECRETS_AVAILABLE == 'false'
        shell: bash
        run: |
@@ -90,12 +90,6 @@ jobs:
          echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
          sha256sum -c checksum
          sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
-
-          # Disable unattended-upgrades to avoid lock issues
-          sudo systemctl disable --now unattended-upgrades
-
-          sudo apt-get clean
-          sudo rm -rf /var/lib/apt/lists/*
          sudo apt update
          sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
        env:
@@ -136,17 +130,6 @@ jobs:
        run: |
          make pcc_gpu

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          # Disable unattended-upgrades to avoid lock issues
-          sudo systemctl disable --now unattended-upgrades
-
-          sudo apt-get clean
-          sudo rm -rf /var/lib/apt/lists/*
-          sudo apt update
-          sudo apt -y install python3-venv
-          make semgrep_and_lint_gpu_code
-
      - name: Check build with hpu enabled
        run: |
          make clippy_gpu_hpu
@@ -176,7 +159,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -66,6 +66,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_signed_integer_classic_tests/setup-instance
@@ -80,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -66,6 +66,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
@@ -86,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -183,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -67,6 +67,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_signed_integer_tests/setup-instance
@@ -81,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +179,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -66,6 +66,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_unsigned_integer_classic_tests/setup-instance
@@ -80,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -66,6 +66,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
@@ -86,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -183,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -67,6 +67,7 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh
+              - ci/slab.toml

  setup-instance:
    name: gpu_unsigned_integer_tests/setup-instance
@@ -81,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +179,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -13,7 +13,9 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -50,13 +52,39 @@ jobs:
              - backends/tfhe-hpu-backend/**
              - mockups/tfhe-hpu-mockup/**

-  cargo-tests-hpu:
-    name: hpu_hlapi_tests/cargo-tests-hpu (bpr)
+  setup-instance:  # Selects the runner for cargo-tests-hpu: a Slab-started AWS instance, or the fallback runner group when secrets are unavailable
+    name: hpu_hlapi_tests/setup-instance
    needs: should-run
    if:
      needs.should-run.outputs.hpu_test == 'true' &&
      ((github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||github.event_name == 'pull_request')
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}  # the two steps have complementary conditions, so only one side is set
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # Slab can only be called when the secrets are readable
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # Fallback for pull requests from forked repositories (no secrets): this step only publishes the runner group name
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cargo-tests-hpu:
+    name: hpu_hlapi_tests/cargo-tests-hpu (bpr)
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
@@ -79,3 +107,27 @@ jobs:
          just -f mockups/tfhe-hpu-mockup/Justfile  BUILD_PROFILE=release mockup &
          make HPU_CONFIG=sim test_high_level_api_hpu
          make HPU_CONFIG=sim test_user_doc_hpu
+
+  teardown-instance:  # Stops the instance started by setup-instance once cargo-tests-hpu has finished
+    name: hpu_hlapi_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}  # run whatever the test result, but only when setup succeeded
+    needs: [setup-instance, cargo-tests-hpu]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # skipped when the fallback runner group was used: no remote instance was started
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # report only when a previous step of this teardown job failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (hpu_hlapi_tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -24,14 +24,32 @@ permissions: {}
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  cpu-tests:
-    name: integer_long_run_tests/cpu-tests
+  setup-instance:  # Starts a Slab AWS instance and exposes its label as the runner for cpu-tests
+    name: integer_long_run_tests/setup-instance
    if: github.event_name != 'schedule' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # consumed by cpu-tests (runs-on) and teardown-instance (label)
+    steps:
+      - name: Start instance  # no fallback runner here: this step always needs the Slab secrets
+        id: start-instance
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+  cpu-tests:
+    name: integer_long_run_tests/cpu-tests
+    needs: [ setup-instance ]
    concurrency:
      group: ${{ github.workflow_ref }}_${{github.event_name}}
      cancel-in-progress: true
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
@@ -56,3 +74,26 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU long run tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # Stops the instance started by setup-instance once cpu-tests has finished
+    name: integer_long_run_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}  # run whatever the test result, but only when setup succeeded
+    needs: [ setup-instance, cpu-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # same label that setup-instance published
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # report only when stopping the instance failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -222,7 +222,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -92,7 +92,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
+        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
+        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -5,8 +5,9 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-
-  SAGEMATH_VERSION: 10.8
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  pull_request:
@@ -25,12 +26,38 @@ permissions: {}
 # zizmor: ignore[concurrency-limits] only Zama organization members and GitHub can trigger this workflow

 jobs:
-  params-curves-security-check:
-    name: parameters_check/params-curves-security-check
+  setup-instance:  # Selects the runner for params-curves-security-check: a Slab-started AWS instance, or the fallback runner group when secrets are unavailable
+    name: parameters_check/setup-instance
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
      github.event_name != 'push'
-    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}  # the two steps have complementary conditions, so only one side is set
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # Slab can only be called when the secrets are readable
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+      # Fallback for pull requests from forked repositories (no secrets): this step only publishes the runner group name
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  params-curves-security-check:
+    name: parameters_check/params-curves-security-check
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -47,39 +74,14 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: malb/lattice-estimator
-          path: lattice-estimator
+          path: lattice_estimator
          ref: '352ddaf4a288a0543f5d9eb588d2f89c7acec463'
          persist-credentials: 'false'

-      - name: Restore Sagemath image from cache
-        id: docker-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        with:
-          path: /tmp/sagemath_image
-          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
-          restore-keys: sagemath-image-
-
-      - name: Load cached Docker sagemath image
-        if: steps.docker-cache.outputs.cache-hit == 'true'
+      - name: Install Sage
        run: |
-          docker load -i /tmp/sagemath_image/sagemath.tar
-
-      - name: Pull Docker sagemath image
-        if: steps.docker-cache.outputs.cache-hit != 'true'
-        run: |
-          docker pull sagemath/sagemath:"${VERSION}"
-          mkdir -p /tmp/sagemath_image
-          docker save sagemath/sagemath:"${VERSION}" -o /tmp/sagemath_image/sagemath.tar
-        env:
-          VERSION: ${{ env.SAGEMATH_VERSION }}
-
-      - name: Store Sagemath image in cache
-        if: steps.docker-cache.outputs.cache-hit != 'true'
-        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        with:
-          path: /tmp/sagemath_image
-          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
+          sudo apt update
+          sudo apt install -y sagemath

      - name: Collect parameters
        run: |
@@ -93,9 +95,7 @@ jobs:

      - name: Perform security check
        run: |
-          docker run \
-          -v "${PWD}":/repo_src \
-          sagemath/sagemath:10.8 /bin/bash /repo_src/scripts/execute_lattice_estimator.sh
+          PYTHONPATH=lattice_estimator sage ci/lattice_estimator.sage

      - name: Get time elapsed
        if: ${{ always() }}
@@ -127,3 +127,27 @@ jobs:
          SLACK_MESSAGE: "Security check for parameters finished with status: ${{ job.status }} (analysis took: ${{ env.TIME_ELAPSED }} mins). (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:  # Stops the instance started by setup-instance once params-curves-security-check has finished
+    name: parameters_check/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}  # run whatever the check result, but only when setup succeeded
+    needs: [setup-instance, params-curves-security-check]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # skipped when the fallback runner group was used: no remote instance was started
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # report only when a previous step of this teardown job failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (params-curves-security-check) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/placeholder_workflow.yml
+++ b/.github/workflows/placeholder_workflow.yml
@@ -1,18 +1,179 @@
 # Placeholder workflow file allowing running it without having to merge to main first
 name: placeholder_workflow

+env:  # Workflow-wide environment shared by every job below
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8 MiB
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}  # read by the should-run outputs to force every test on non-PR events
+  PULL_REQUEST_MD_LINK: ""  # overwritten by the "Set pull-request URL" step when a PR run fails
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}  # falls back to GITHUB_TOKEN when REPO_CHECKOUT_TOKEN is not set
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"  # runner group published by setup-instance when SECRETS_AVAILABLE is 'false'
+
 on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]

-permissions: {}
+permissions:
+  contents: read

-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  placeholder:
-    name: placeholder_workflow/placeholder
+  should-run:  # Decides which test groups to run: everything on non-PR events, otherwise based on the files changed by the pull request
+    name: aws_tfhe_param_prod_tests/should-run
+    if: (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
-
+    permissions:
+      pull-requests: read # Needed to check for file change
+    outputs:
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}  # NOTE(review): files_yaml below defines no `csprng` group, so on PRs this is presumably always empty - confirm
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}  # NOTE(review): files_yaml below defines no `zk_pok` group, so on PRs this is presumably always empty - confirm
+      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}  # gates the setup-instance job on pull requests
    steps:
-      - run: |
-          echo "Hello this is a Placeholder Workflow"
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
+        with:
+          files_yaml: |
+            dependencies:
+              - tfhe/Cargo.toml
+              - tfhe-csprng/**
+              - tfhe-fft/**
+              - tfhe-zk-pok/**
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            versionable:
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+      - name: Aggregate file changes
+        id: aggregated-changes  # NOTE(review): the csprng / zk_pok / core_crypto checks below have no matching files_yaml group; only dependencies, versionable and shortint can match - confirm
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' )
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
+
+  setup-instance:  # Selects the runner for param-prod-tests: a Slab-started AWS instance, or the fallback runner group when secrets are unavailable
+    name: aws_tfhe_param_prod_tests/setup-instance
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}  # the two steps have complementary conditions, so only one side is set
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # NOTE(review): pinned to v1.4.2 below while the other workflows in this change pin v1.5.0 - confirm this is intended
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # Fallback for pull requests from forked repositories (no secrets): this step only publishes the runner group name
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  param-prod-tests:  # Runs the shortint tests with the production parameters on the runner chosen by setup-instance
+    name: aws_tfhe_param_prod_tests/param-prod-tests
+    needs: [ should-run, setup-instance ]
+    concurrency:  # a newer run of the same workflow ref cancels the one in progress
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
+      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true'  # keys are only generated when the shortint tests will run
+        run: |
+          make gen_key_cache
+
+      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
+        run: |
+          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=FALSE make test_param_prod_shortint_ci
+      - name: Set pull-request URL
+        if: ${{ failure() && github.event_name == 'pull_request' }}  # only needed to enrich the failure notification below
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Slack Notification
+        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}  # the Slack secrets are not readable without SECRETS_AVAILABLE
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Param prod tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:  # Stops the instance started by setup-instance once param-prod-tests has finished
+    name: aws_tfhe_param_prod_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}  # run whatever the test result, but only when setup succeeded
+    needs: [ setup-instance, param-prod-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # skipped when the fallback runner group was used: no remote instance was started
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}  # cancellations are reported too, except on pull requests
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (param-prod-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -18,7 +18,7 @@ jobs:
      issues: read # Needed to fetch all issues
      pull-requests: write # Needed to write message and close the PR
    steps:
-      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
        with:
          stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
          days-before-stale: 2
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
-backends/tfhe-cuda-backend/cuda/build/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
--- a/.linelint.yml
+++ b/.linelint.yml
@@ -9,7 +9,7 @@ ignore:
  - tfhe/web_wasm_parallel_tests/dist
  - keys
  - coverage
-  - utils/tfhe-lints/tests/*/main.stderr
+  - utils/tfhe-lints/ui/main.stderr
  - utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated

 rules:
--- a/28
+++ b/28
@@ -2,37 +2,35 @@
 # i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
 # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file

-/backends/tfhe-cuda-backend/            @zama-ai/gpu
-/backends/zk-cuda-backend/              @zama-ai/gpu
+/backends/tfhe-cuda-backend/            @agnesLeroy
 /backends/tfhe-hpu-backend/             @zama-ai/hardware

 /tfhe/examples/hpu                      @zama-ai/hardware

-/tfhe/src/core_crypto/                  @IceTDrinker @mayeul-zama
-/tfhe/src/core_crypto/gpu               @zama-ai/gpu
+/tfhe/src/core_crypto/                  @IceTDrinker
+/tfhe/src/core_crypto/gpu               @agnesLeroy
 /tfhe/src/core_crypto/hpu               @zama-ai/hardware

 /tfhe/src/shortint/                     @mayeul-zama @nsarlin-zama

-/tfhe/src/integer/                      @tmontaigu @nsarlin-zama
-/tfhe/src/integer/gpu                   @zama-ai/gpu
+/tfhe/src/integer/                      @tmontaigu
+/tfhe/src/integer/gpu                   @agnesLeroy
 /tfhe/src/integer/hpu                   @zama-ai/hardware

-/tfhe/src/high_level_api/               @tmontaigu @nsarlin-zama
+/tfhe/src/high_level_api/               @tmontaigu

-/tfhe-zk-pok/                           @nsarlin-zama @tmontaigu
-/tfhe-zk-pok/src/gpu                    @zama-ai/gpu
+/tfhe-zk-pok/                           @nsarlin-zama

-/tfhe-benchmark/                        @soonum @SouchonTheo
+/tfhe-benchmark/                        @soonum

-/utils/                                 @nsarlin-zama @SouchonTheo
+/utils/                                 @nsarlin-zama

 /Makefile                               @IceTDrinker @soonum

 /mockups/tfhe-hpu-mockup                @zama-ai/hardware

-/.github/                               @soonum @SouchonTheo
-/ci/                                    @soonum @SouchonTheo
-/scripts/                               @soonum @SouchonTheo
+/.github/                               @soonum
+/ci/                                    @soonum
+/scripts/                               @soonum

-/CODEOWNERS                             @IceTDrinker @nsarlin-zama
+/CODEOWNERS                             @IceTDrinker
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,7 +17,7 @@ Start by [forking](https://docs.github.com/en/pull-requests/collaborating-with-p
 - **Performance**: For optimal performance, it is highly recommended to run **TFHE-rs** code in release mode with cargo's `--release` flag.
 {% endhint %}

-To get more details about the library, please refer to the [documentation](https://docs.zama.org/tfhe-rs).
+To get more details about the library, please refer to the [documentation](https://docs.zama.ai/tfhe-rs).

 ## 2. Creating a new branch

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,12 +9,10 @@ members = [
    "tasks",
    "tfhe-csprng",
    "backends/tfhe-cuda-backend",
-    "backends/zk-cuda-backend",
    "backends/tfhe-hpu-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/tfhe-backward-compat-data",
-    "utils/tfhe-backward-compat-data/crates/add_new_version",
    "utils/param_dedup",
    "tests",
    "mockups/tfhe-hpu-mockup",
@@ -28,8 +26,6 @@ rust-version = "1.91.1"

 [workspace.dependencies]
 aligned-vec = { version = "0.6", default-features = false }
-ark-ec = "0.5.0"
-ark-ff = "0.5.0"
 bytemuck = "1.24"
 dyn-stack = { version = "0.13", default-features = false }
 itertools = "0.14"
@@ -41,11 +37,7 @@ serde = { version = "1.0", default-features = false }
 wasm-bindgen = "0.2.101"
 getrandom = "0.2.8"
 # The project maintainers consider that this is the last version of the 1.3 branch, any newer version should not be trusted
-bindgen = "0.71"
 bincode = "=1.3.3"
-cmake = "0.1"
-pkg-config = "0.3"
-clap = { version = "4.5", features = ["derive"] }

 [profile.bench]
 lto = "fat"
--- a/269
+++ b/269
@@ -1,7 +1,4 @@
 SHELL:=$(shell /usr/bin/env which bash)
-# Enable stop on error, no undefined variables
-# the c flag is to run the script inline
-.SHELLFLAGS := -eu -c
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat nightly-toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
@@ -58,9 +55,6 @@ REGEX_PATTERN?=''
 # tfhe-cuda-backend
 TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
-ZKCUDA_SRC=backends/zk-cuda-backend/cuda
-ZKCUDA_BUILD=$(ZKCUDA_SRC)/build
-ZKCUDARS_SRC=backends/zk-cuda-backend/src

 # tfhe-hpu-backend
 HPU_CONFIG=v80
@@ -270,23 +264,12 @@ install_mlc:
 	cargo install mlc --locked || \
 	( echo "Unable to install mlc, unknown error." && exit 1 )

-fmt: FMT_CHECK =
 .PHONY: fmt # Format rust code
-fmt: fmt_internal
-
-check_fmt: FMT_CHECK = --check
-.PHONY: check_fmt # Check rust code format
-check_fmt: fmt_internal
-
-.PHONY: fmt_internal # internal recipe for fmt
-fmt_internal: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt $(FMT_CHECK)
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt $(FMT_CHECK)
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt $(FMT_CHECK)
-	for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
-		echo "fmt $$crate"; \
-		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate fmt $(FMT_CHECK); \
-	done
+fmt: install_rs_check_toolchain
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt

 .PHONY: fmt_js # Format javascript code
 fmt_js: check_nvm_installed
@@ -296,24 +279,10 @@ fmt_js: check_nvm_installed
 	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt && \
 	$(MAKE) -C tfhe/js_on_wasm_tests fmt

-.PHONY: semgrep_lint_setup_venv # Create venv and install Python dependencies for GPU lint checks
-semgrep_lint_setup_venv:
-	python3 -m venv venv
-	venv/bin/pip install -r scripts/gpu-lint-requirements.txt
-
-.PHONY: semgrep_and_lint_gpu_code # Run semgrep and lint checks on CUDA backend code
-semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
-	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
-		| grep -v '/cmake-build-debug/' \
-		| grep -v '/build/' \
-		| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
-	venv/bin/python3 "scripts/check_scratch_cleanup.py"
-
 .PHONY: fmt_gpu # Format rust and cuda code
 fmt_gpu: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
 	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh
-	cd "$(ZKCUDA_SRC)" && ./format_zk_cuda_backend.sh

 .PHONY: fmt_c_tests # Format c tests
 fmt_c_tests:
@@ -323,6 +292,13 @@ fmt_c_tests:
 fmt_toml: install_taplo
 	taplo fmt

+.PHONY: check_fmt # Check rust code format
+check_fmt: install_rs_check_toolchain
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check
+
 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
 	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
@@ -331,7 +307,6 @@ check_fmt_c_tests:
 check_fmt_gpu: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
 	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
-	cd "$(ZKCUDA_SRC)" && ./format_zk_cuda_backend.sh -c

 .PHONY: check_fmt_js # Check javascript code format
 check_fmt_js: check_nvm_installed
@@ -353,14 +328,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p tfhe

@@ -374,7 +349,7 @@ clippy_hpu: install_rs_check_toolchain
 .PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
 clippy_gpu_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

@@ -467,7 +442,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 	fi && \
 	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
 		-p tfhe -- --nocapture

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -543,10 +518,11 @@ clippy_param_dedup: install_rs_check_toolchain

 .PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
 clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-backward-compat-data -- --no-deps -D warnings
 	@# Some old crates are x86 specific, only run in that case
 	@if uname -a | grep -q x86; then \
+		RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
+			-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all --all-targets \
+			-- --no-deps -D warnings; \
 		for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
 			echo "checking $$crate"; \
 			RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
@@ -575,8 +551,6 @@ clippy_core clippy_tfhe_csprng
 clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p zk-cuda-backend -- --no-deps -D warnings

 .PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
 clippy_hpu_backend: install_rs_check_toolchain
@@ -670,7 +644,7 @@ build_c_api: install_rs_check_toolchain
 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
 		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -748,28 +722,8 @@ test_cuda_backend:
 		"$(MAKE)" -j "$(CPU_COUNT)" && \
 		"$(MAKE)" test

-.PHONY: test_cuda_backend_race_check # Build and run selected CUDA backend tests with Compute Sanitizer racecheck
-test_cuda_backend_race_check:
-	mkdir -p "$(TFHECUDA_BUILD)" && \
-		cd "$(TFHECUDA_BUILD)" && \
-		cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
-		"$(MAKE)" -j "$(CPU_COUNT)" test_tfhe_cuda_backend && \
-		compute-sanitizer --tool racecheck --target-processes all ./tests_and_benchmarks/tests/test_tfhe_cuda_backend \
-			--gtest_filter="*ClassicalProgrammableBootstrap*:*MultiBitProgrammableBootstrap*"
-
-.PHONY: test_zk_cuda_backend # Run the internal tests of the CUDA ZK backend
-test_zk_cuda_backend:
-	mkdir -p "$(ZKCUDA_BUILD)" && \
-		cd "$(ZKCUDA_BUILD)" && \
-		cmake .. -DCMAKE_BUILD_TYPE=Release -DZK_CUDA_BACKEND_BUILD_TESTS=ON && \
-		"$(MAKE)" -j "$(CPU_COUNT)" && \
-		"$(MAKE)" test
-	cd "$(ZKCUDARS_SRC)" && \
-		cargo test --release
-
-
 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu:
@@ -809,7 +763,7 @@ test_integer_hl_test_gpu_check_warnings:
 		--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p tfhe &> /tmp/gpu_compile_output
 	WARNINGS=$$(cat /tmp/gpu_compile_output | grep ": warning #" | grep "\[tfhe-cuda-backend" | grep -v "inline qualifier" || true) && \
 	if [[ "$${WARNINGS}" != "" ]]; then \
-		echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
+	    echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
 		echo "$${WARNINGS}" && exit 1; \
 	fi

@@ -966,6 +920,14 @@ test_shortint_ci: install_cargo_nextest
 		./scripts/shortint-tests.sh \
 		--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "tfhe"

+.PHONY: test_param_prod_shortint_ci # Run the shortint ci tests for the prod parameters only (--run-prod-only)
+test_param_prod_shortint_ci: install_cargo_nextest
+	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
+	FAST_TESTS="$(FAST_TESTS)" \
+		./scripts/shortint-tests.sh \
+		--cargo-profile "$(CARGO_PROFILE)" --run-prod-only --tfhe-package "tfhe"
+
+
 .PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
 test_shortint_multi_bit_ci: install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -1205,31 +1167,12 @@ test_tfhe_csprng_big_endian: install_cargo_cross
 	RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu

+
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok --features experimental

-.PHONY: test_zk_pok_gpu # Run tfhe-zk-pok GPU-accelerated tests
-test_zk_pok_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
-
-.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
-test_integer_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
-		--features=integer,zk-pok,gpu -p tfhe -- \
-		integer::gpu::zk::
-
-.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
-test_integer_zk_experimental_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
-		--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
-		integer::gpu::zk::
-
-.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
-test_zk_cuda: install_rs_check_toolchain test_zk_cuda_backend test_zk_pok_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
-
 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
@@ -1503,14 +1446,14 @@ bench_integer_hpu: install_rs_check_toolchain

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1522,47 +1465,26 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain
 	--bench	glwe_packing_compression_128b-integer-bench \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_msm_zk
-bench_msm_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench zk-msm \
-	--features=zk-pok -p tfhe-benchmark --profile release --
-
-.PHONY: bench_msm_zk_gpu
-bench_msm_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench zk-msm \
-	--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release --
-
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
-
-.PHONY: bench_integer_zk_experimental_gpu
-bench_integer_zk_experimental_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
 bench_integer_aes_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes \
 	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
 bench_integer_aes256_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes256 \
 	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
 bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1612,7 +1534,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1799,14 +1721,14 @@ bench_hlapi_erc20: install_rs_check_toolchain
 .PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
 bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
 bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

@@ -1845,13 +1767,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

-.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
-bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--package tfhe-zk-pok \
-	--features=gpu-experimental --profile release
-
 .PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
 bench_hlapi_noise_squash: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
@@ -1866,108 +1781,6 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_kvstore # Run benchmarks for Key-Value Store operations
-bench_hlapi_kvstore: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-kvstore \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
-
-.PHONY: bench_summary # Run summary benchmarks
-bench_summary: install_rs_check_toolchain
-	# Arithmetic operations: addition, multiplication, division, comparison
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi_unsigned \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::add|::mul|::gt|::div_rem'
-
-	# Noise squash
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::noise_squash::'
-
-	# Noise squash and compression
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'
-
-	# ERC20
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
-	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
-
-	# DEX
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-dex \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::no_cmux::'
-
-	# ZK
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-zk-pke \
-	--features=integer,internal-keycache,zk-pok,pbs-stats \
-	-p tfhe-benchmark --
-
-	# Compression
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-glwe_packing_compression \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
-
-.PHONY: bench_summary_gpu # Run summary benchmarks on GPU
-bench_summary_gpu: install_rs_check_toolchain
-	# Arithmetic operations: addition, multiplication, division, comparison
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer \
-	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::add|::mul|::gt|::div_rem'
-
-	# Noise squash
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::noise_squash::'
-
-	# Noise squash and compression
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
-
-	# ERC20
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
-	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
-
-	# DEX
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE)  __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-dex \
-	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::no_cmux::'
-
-	# ZK
-	# Proof is done on CPU node of the instance
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-zk-pke \
-	--features=integer,internal-keycache,zk-pok,pbs-stats \
-	-p tfhe-benchmark -- '::pke_zk_proof'
-	# Verify is done on GPUs
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --
-
-	# Compression
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-glwe_packing_compression \
-	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_custom # Run benchmarks with a user-defined command
 bench_custom: install_rs_check_toolchain
@@ -2094,7 +1907,7 @@ pcc_batch_1:
 pcc_batch_2:
 	$(call run_recipe_with_details,clippy)
 	$(call run_recipe_with_details,clippy_all_targets)
-	$(call run_recipe_with_details,check_fmt_js)  # This needs to stay there, CI pipeline rely on this recipe to conditionally install Node
+	$(call run_recipe_with_details,check_fmt_js)
 	$(call run_recipe_with_details,clippy_test_vectors)
 	$(call run_recipe_with_details,check_test_vectors)

--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 <hr/>

 <p align="center">
-  <a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.org/tfhe-rs"> 📒 Documentation</a> | <a href="https://www.zama.org/community-channels"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
+  <a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
 </p>


@@ -47,7 +47,7 @@ production-ready library for all the advanced features of TFHE.
 - **Ciphertext and server key compression** for efficient data transfer
 - **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.

-*Learn more about TFHE-rs features in the [documentation](https://docs.zama.org/tfhe-rs).*
+*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
 <br></br>

 ## Table of Contents
@@ -149,7 +149,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performance possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.org/tfhe-rs/get-started/quick-start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -163,25 +163,25 @@ to run in release mode with cargo's `--release` flag to have the best performanc
 A document containing scientific and technical details about algorithms implemented into the library is available here: [TFHE-rs: A (Practical) Handbook](https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf).

 ### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.org/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.org/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.org/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.org/post/tfhe-deep-dive-part-4)
+- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
+- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
+- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
+- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
 <br></br>

 ### Tutorials
- [Video tutorial: Implement signed integers using TFHE-rs](https://www.zama.org/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.org/tfhe-rs/tutorials/parity-bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.org/tfhe-rs/tutorials/ascii-fhe-string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.org/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.org/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.org/post/regex-engine-tfhe-rs)
+- [[Video tutorial] Implement signed integers using TFHE-rs ](https://www.zama.ai/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
+- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity-bit)
+- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii-fhe-string)
+- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
+- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
+- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)

-*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.org/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
+*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
 <br></br>
 ### Documentation

-Full, comprehensive documentation is available here: [https://docs.zama.org/tfhe-rs](https://docs.zama.org/tfhe-rs).
+Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
 <p align="right">
  <a href="#about" > ↑ Back to top </a>
 </p>
@@ -202,7 +202,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
 ### Security model

 By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
-If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.org/tfhe-rs).
+If you want to work within the IND-CPA security model, which is less strict than the IND-CPA^D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).

 [1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf

@@ -231,7 +231,7 @@ To cite TFHE-rs in academic papers, please use the following entry:
 There are two ways to contribute to TFHE-rs:

 - [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.org](mailto:hello@zama.org).
+- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).

 Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
 <br></br>
@@ -243,16 +243,16 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
 **Is Zama’s technology free to use?**
 >Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
 >
->Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.org/post/open-source).
+>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).

 **What do I need to do if I want to use Zama’s technology for commercial purposes?**
->To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us hello@zama.org for more information.
+>To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us hello@zama.ai for more information.

 **Do you file IP on your technology?**
 >Yes, all Zama’s technologies are patented.

 **Can you customize a solution for my specific use case?**
->We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.org.
+>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -261,7 +261,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi

 ## Support

-<a target="_blank" href="https://community.zama.org">
+<a target="_blank" href="https://community.zama.ai">
 <picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -5,16 +5,16 @@ edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
 description = "Cuda implementation of TFHE-rs primitives."
-homepage = "https://www.zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://www.zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

 [build-dependencies]
-cmake.workspace = true
-pkg-config.workspace = true
-bindgen.workspace = true
+cmake = { version = "0.1" }
+pkg-config = { version = "0.3" }
+bindgen = "0.71"

 [features]
 experimental-multi-arch = []
--- a/backends/tfhe-cuda-backend/README.md
+++ b/backends/tfhe-cuda-backend/README.md
@@ -51,4 +51,4 @@ If your machine does not have an available Nvidia GPU, the compilation will work
 ## License

 This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
-please contact us at `hello@zama.org`.
+please contact us at `hello@zama.ai`.
--- a/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
+++ b/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
@@ -1,64 +0,0 @@
-rules:
-  - id: release-missing-cuda-synchronize
-    message: >-
-      release() method does not call cuda_synchronize_stream or delegate to
-      another release(). All release methods must synchronize the CUDA stream
-      (directly or via delegation) to ensure async GPU operations complete
-      before returning.
-    severity: ERROR
-    languages: [cpp]
-    paths:
-      exclude:
-        - "**/helper_multi_gpu.h"
-    patterns:
-      - pattern: |
-          void release(...) {
-            ...
-          }
-      - pattern-not: |
-          void release(...) {
-            ...
-            cuda_synchronize_stream($S.stream(0), ...);
-            ...
-          }
-      - pattern-not: |
-          void release(cudaStream_t stream, ...) {
-            ...
-            cuda_synchronize_stream(stream, ...);
-            ...
-          }
-      - pattern-not: |
-          void release(...) {
-            ...
-            $MEM->release(...);
-            ...
-          }
-
-
-  - id: cleanup-missing-release-or-synchronize
-    message: >-
-      cleanup_ function does not call release() or cuda_synchronize_stream().
-      All non-async cleanup_ functions must either call release() on a memory
-      structure or synchronize the CUDA stream.
-    severity: ERROR
-    languages: [cpp]
-    patterns:
-      - pattern: |
-          void $FUNC(...) {
-            ...
-          }
-      - metavariable-regex:
-          metavariable: $FUNC
-          regex: ^cleanup_.*(?<!_async)$
-      - pattern-not: |
-          void $FUNC(...) {
-            ...
-            $MEM->release(...);
-            ...
-          }
-      - pattern-not: |
-          void $FUNC(...) {
-            ...
-            cuda_synchronize_stream(...);
-            ...
-          }
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
@@ -3,7 +3,7 @@
 #include "../integer/integer.h"

 extern "C" {
-uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
+uint64_t scratch_cuda_integer_aes_encrypt_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -12,28 +12,18 @@ uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
    uint32_t sbox_parallelism);

-uint64_t scratch_cuda_integer_aes_ctr_256_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism);
+void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
+                                     CudaRadixCiphertextFFI *output,
+                                     CudaRadixCiphertextFFI const *iv,
+                                     CudaRadixCiphertextFFI const *round_keys,
+                                     const uint64_t *counter_bits_le_all_blocks,
+                                     uint32_t num_aes_inputs, int8_t *mem_ptr,
+                                     void *const *bsks, void *const *ksks);

-void cuda_integer_aes_ctr_encrypt_64_async(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
-    const uint64_t *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
+                                         int8_t **mem_ptr_void);

-void cleanup_cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
-                                             int8_t **mem_ptr_void);
-
-void cleanup_cuda_integer_aes_ctr_256_encrypt_64(CudaStreamsFFI streams,
-                                                 int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_key_expansion_64_async(
+uint64_t scratch_cuda_integer_key_expansion_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -41,22 +31,22 @@ uint64_t scratch_cuda_integer_key_expansion_64_async(
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_key_expansion_64_async(CudaStreamsFFI streams,
-                                         CudaRadixCiphertextFFI *expanded_keys,
-                                         CudaRadixCiphertextFFI const *key,
-                                         int8_t *mem_ptr, void *const *bsks,
-                                         void *const *ksks);
+void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
+                                   CudaRadixCiphertextFFI *expanded_keys,
+                                   CudaRadixCiphertextFFI const *key,
+                                   int8_t *mem_ptr, void *const *bsks,
+                                   void *const *ksks);

 void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

-void cuda_integer_aes_ctr_256_encrypt_64_async(
+void cuda_integer_aes_ctr_256_encrypt_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
    const uint64_t *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks);

-uint64_t scratch_cuda_integer_key_expansion_256_64_async(
+uint64_t scratch_cuda_integer_key_expansion_256_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -64,10 +54,11 @@ uint64_t scratch_cuda_integer_key_expansion_256_64_async(
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_key_expansion_256_64_async(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *expanded_keys,
-    CudaRadixCiphertextFFI const *key, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+void cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
+                                       CudaRadixCiphertextFFI *expanded_keys,
+                                       CudaRadixCiphertextFFI const *key,
+                                       int8_t *mem_ptr, void *const *bsks,
+                                       void *const *ksks);

 void cleanup_cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -34,7 +34,7 @@ template <typename Torus> struct int_aes_lut_buffers {
        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
        params.pbs_type);
    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and_lut, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);

    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -48,7 +48,7 @@ template <typename Torus> struct int_aes_lut_buffers {
    auto active_streams_flush_lut = streams.active_gpu_subset(
        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush_lut, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -60,7 +60,7 @@ template <typename Torus> struct int_aes_lut_buffers {
    auto active_streams_carry_lut =
        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
    this->carry_lut->generate_and_broadcast_lut(
-        active_streams_carry_lut, {0}, {carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

@@ -179,11 +179,11 @@ template <typename Torus> struct int_aes_counter_workspaces {
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

    this->h_counter_bits_buffer =
-        (Torus *)malloc(safe_mul_sizeof<Torus>(num_aes_inputs));
-    size_tracker += safe_mul_sizeof<Torus>(num_aes_inputs);
+        (Torus *)malloc(num_aes_inputs * sizeof(Torus));
+    size_tracker += num_aes_inputs * sizeof(Torus);
    this->d_counter_bits_buffer = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_aes_inputs), streams.stream(0),
-        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+        num_aes_inputs * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+        size_tracker, allocate_gpu_memory);
  }

  void release(CudaStreams streams, bool allocate_gpu_memory) {
--- a/backends/tfhe-cuda-backend/cuda/include/checked_arithmetic.h
+++ b/backends/tfhe-cuda-backend/cuda/include/checked_arithmetic.h
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <cstdio>
-
-#include "device.h"
-
-// Variadic checked multiplication of size_t values.
-// Folds left-to-right using __builtin_mul_overflow, returning true on overflow.
-// On overflow the value written to *out is unspecified.
-template <typename... Args>
-inline bool checked_mul(size_t *out, size_t first, Args... rest) {
-  size_t result = first;
-  for (size_t value : {static_cast<size_t>(rest)...}) {
-    if (__builtin_mul_overflow(result, value, &result))
-      return true;
-  }
-  *out = result;
-  return false;
-}
-
-// Variadic safe multiplication: computes the product and panics on overflow.
-template <typename... Args> inline size_t safe_mul(size_t first, Args... rest) {
-  size_t result;
-  bool overflow = checked_mul(&result, first, rest...);
-  PANIC_IF_FALSE(!overflow, "multiplication overflow wraps size_t");
-  return result;
-}
-
-// Variadic safe multiplication with an appended sizeof(T) factor.
-// Computes (args... * sizeof(T)) with overflow checking.
-template <typename T, typename... Args>
-inline size_t safe_mul_sizeof(Args... args) {
-  return safe_mul(args..., sizeof(T));
-}
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -5,36 +5,39 @@

 extern "C" {

-void cuda_convert_lwe_ciphertext_vector_to_gpu_64_async(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t number_of_cts, uint32_t lwe_dimension);
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64_async(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t number_of_cts, uint32_t lwe_dimension);
+void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
+                                                  uint32_t gpu_index,
+                                                  void *dest, void const *src,
+                                                  uint32_t number_of_cts,
+                                                  uint32_t lwe_dimension);
+void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
+                                                  uint32_t gpu_index,
+                                                  void *dest, void const *src,
+                                                  uint32_t number_of_cts,
+                                                  uint32_t lwe_dimension);

-void cuda_glwe_sample_extract_64_async(
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void const *glwe_array_in,
+                                 uint32_t const *nth_array, uint32_t num_nths,
+                                 uint32_t lwe_per_glwe, uint32_t glwe_dimension,
+                                 uint32_t polynomial_size);
+
+void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
+                                    void *lwe_array_out, uint32_t size,
+                                    uint32_t log_modulus);
+
+void cuda_modulus_switch_64(void *stream, uint32_t gpu_index, void *lwe_out,
+                            const void *lwe_in, uint32_t size,
+                            uint32_t log_modulus);
+
+void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
+                                     void *lwe_out, const void *lwe_in,
+                                     uint32_t lwe_dimension,
+                                     uint32_t log_modulus);
+
+void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
-    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
-    uint32_t glwe_dimension, uint32_t polynomial_size);
-
-void cuda_modulus_switch_inplace_64_async(void *stream, uint32_t gpu_index,
-                                          void *lwe_array_out, uint32_t size,
-                                          uint32_t log_modulus);
-
-void cuda_modulus_switch_64_async(void *stream, uint32_t gpu_index,
-                                  void *lwe_out, const void *lwe_in,
-                                  uint32_t size, uint32_t log_modulus);
-
-void cuda_centered_modulus_switch_64_async(void *stream, uint32_t gpu_index,
-                                           void *lwe_out, const void *lwe_in,
-                                           uint32_t lwe_dimension,
-                                           uint32_t log_modulus);
-
-void cuda_glwe_sample_extract_128_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
-    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
-    uint32_t glwe_dimension, uint32_t polynomial_size);
+    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -132,8 +132,6 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);

 uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);

-uint32_t cuda_get_max_shared_memory_per_block(uint32_t gpu_index);
-
 bool cuda_check_support_cooperative_groups();

 bool cuda_check_support_thread_block_clusters();
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -150,9 +150,9 @@ public:
        _gpu_count(src._gpu_count), _owns_streams(false) {}

  CudaStreams &operator=(CudaStreams const &other) {
-    /*    PANIC_IF_FALSE(this->_streams == nullptr ||
-                           this->_streams == other._streams,
-                       "Assigning an already initialized CudaStreams");*/
+    PANIC_IF_FALSE(this->_streams == nullptr ||
+                       this->_streams == other._streams,
+                   "Assigning an already initialized CudaStreams");
    this->_streams = other._streams;
    this->_gpu_indexes = other._gpu_indexes;
    this->_gpu_count = other._gpu_count;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -47,7 +47,7 @@ template <typename Torus> struct boolean_bitop_buffer {
        // only lut for degree = 1 is generated
        lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
                                                  {lut_bivariate_f},
-                                                  LUT_0_FOR_ALL_BLOCKS, {}, 2);
+                                                  gpu_memory_allocated, {}, 2);
      }
      break;
    default:
@@ -63,7 +63,7 @@ template <typename Torus> struct boolean_bitop_buffer {
      };

      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -134,7 +134,7 @@ template <typename Torus> struct int_bitop_buffer {
        };

        lut->generate_and_broadcast_bivariate_lut(
-            active_streams, {0}, {lut_bivariate_f}, LUT_0_FOR_ALL_BLOCKS);
+            active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
      }
      break;
    default:
@@ -166,7 +166,7 @@ template <typename Torus> struct int_bitop_buffer {
      }

      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
-                                      LUT_0_FOR_ALL_BLOCKS);
+                                      gpu_memory_allocated);
    }
  }

@@ -203,7 +203,7 @@ template <typename Torus> struct boolean_bitnot_buffer {
          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);

      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -37,7 +37,7 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
          }},
-          LUT_0_FOR_ALL_BLOCKS);
+          allocate_gpu_memory);

      this->last_block = new CudaRadixCiphertextFFI;

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -85,28 +85,30 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

+    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
+    for (int index = 0; index < 2 * num_radix_blocks; index++) {
+      if (index < num_radix_blocks) {
+        h_lut_indexes[index] = 0;
+      } else {
+        h_lut_indexes[index] = 1;
+      }
+    }
+    cuda_memcpy_with_size_tracking_async_to_gpu(
+        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
+        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
+        streams.gpu_index(0), allocate_gpu_memory);
+
    auto active_streams_pred =
        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    auto lut_index_generator = [num_radix_blocks](Torus *h_lut_indexes,
-                                                  uint32_t num_indexes) {
-      for (int index = 0; index < 2 * num_radix_blocks; index++) {
-        if (index < num_radix_blocks) {
-          h_lut_indexes[index] = 0;
-        } else {
-          h_lut_indexes[index] = 1;
-        }
-      }
-    };
-
    predicate_lut->generate_and_broadcast_bivariate_lut(
        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
-        lut_index_generator);
+        gpu_memory_allocated);

    auto active_streams_msg =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);

    message_extract_lut->generate_and_broadcast_lut(
-        active_streams_msg, {0}, {message_extract_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -1,5 +1,4 @@
 #pragma once
-#include "checked_arithmetic.h"
 #include "cmux.h"
 #include "integer_utilities.h"

@@ -40,8 +39,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
-        params.glwe_dimension + 1, params.polynomial_size));
+    preallocated_h_lut = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));

    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
                                            allocate_gpu_memory, size_tracker);
@@ -54,7 +53,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    };

    is_max_value->generate_and_broadcast_lut(
-        active_streams, {0}, {is_max_value_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
@@ -106,7 +105,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    is_non_zero_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {is_non_zero_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -136,7 +135,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
    }

    scalar_comparison_luts->generate_and_broadcast_lut(
-        active_streams, lut_indices, lut_funcs, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);

    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
@@ -144,7 +143,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
                                   allocate_gpu_memory, size_tracker);

      operator_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {operator_f}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {operator_f}, gpu_memory_allocated);
    } else {
      operator_lut = nullptr;
    }
@@ -215,8 +214,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
-        params.glwe_dimension + 1, params.polynomial_size));
+    preallocated_h_lut = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));

    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -228,7 +227,7 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {block_selector_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
@@ -308,10 +307,10 @@ template <typename Torus> struct int_comparison_diff_buffer {
    reduce_signs_lut =
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);
-    preallocated_h_lut1 = (Torus *)malloc(safe_mul_sizeof<Torus>(
-        params.glwe_dimension + 1, params.polynomial_size));
-    preallocated_h_lut2 = (Torus *)malloc(safe_mul_sizeof<Torus>(
-        params.glwe_dimension + 1, params.polynomial_size));
+    preallocated_h_lut1 = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    preallocated_h_lut2 = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
  }

  void release(CudaStreams streams) {
@@ -413,7 +412,7 @@ template <typename Torus> struct int_comparison_buffer {
                                 allocate_gpu_memory, size_tracker);

    identity_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {identity_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -424,7 +423,7 @@ template <typename Torus> struct int_comparison_buffer {
                                           allocate_gpu_memory, size_tracker);

    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
-                                            LUT_0_FOR_ALL_BLOCKS);
+                                            gpu_memory_allocated);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -501,10 +500,10 @@ template <typename Torus> struct int_comparison_buffer {

      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
      signed_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {signed_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
    }
-    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
-        params.glwe_dimension + 1, params.polynomial_size));
+    preallocated_h_lut = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -5,15 +5,14 @@
 #include "../integer.h"

 extern "C" {
-uint64_t scratch_cuda_integer_compress_radix_ciphertext_64_async(
+uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_lwes_stored_per_glwe,
-    bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

-uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
+uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
@@ -22,12 +21,12 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_compress_radix_ciphertext_64_async(
+void cuda_integer_compress_radix_ciphertext_64(
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
    int8_t *mem_ptr);

-void cuda_integer_decompress_radix_ciphertext_64_async(
+void cuda_integer_decompress_radix_ciphertext_64(
    CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_in,
    uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
@@ -38,26 +37,25 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
 void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void);

-uint64_t scratch_cuda_integer_compress_radix_ciphertext_128_async(
+uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_lwes_stored_per_glwe,
-    bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

-uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128_async(
+uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, bool allocate_gpu_memory);

-void cuda_integer_compress_radix_ciphertext_128_async(
+void cuda_integer_compress_radix_ciphertext_128(
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
    int8_t *mem_ptr);

-void cuda_integer_decompress_radix_ciphertext_128_async(
+void cuda_integer_decompress_radix_ciphertext_128(
    CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_in,
    uint32_t const *indexes_array, int8_t *mem_ptr);
@@ -68,12 +66,12 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
 void cleanup_cuda_integer_decompress_radix_ciphertext_128(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

-void cuda_integer_extract_glwe_128_async(
+void cuda_integer_extract_glwe_128(
    CudaStreamsFFI streams, void *glwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_list,
    uint32_t const glwe_index);

-void cuda_integer_extract_glwe_64_async(
+void cuda_integer_extract_glwe_64(
    CudaStreamsFFI streams, void *glwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_list,
    uint32_t const glwe_index);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -10,37 +10,35 @@ template <typename Torus> struct int_compression {
  Torus *tmp_lwe;
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
-  uint32_t num_lwes_stored_per_glwe;
+  uint32_t lwe_per_glwe;
  uint32_t max_num_glwes;

  // num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
-  // compress num_lwes_stored_per_glwe: max LWEs packed per GLWE (<=
-  // polynomial_size), defined by the chosen parameter set
+  // compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
+  // defined by the chosen parameter set
  int_compression(CudaStreams streams, int_radix_params compression_params,
-                  uint32_t num_radix_blocks, uint32_t num_lwes_stored_per_glwe,
+                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
-    this->num_lwes_stored_per_glwe = num_lwes_stored_per_glwe;
+    this->lwe_per_glwe = lwe_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

    // Calculate the actual number of GLWEs needed based on total radix blocks.
    // This ensures we allocate enough memory when num_radix_blocks >
-    // num_lwes_stored_per_glwe.
-    max_num_glwes = CEIL_DIV(num_radix_blocks, num_lwes_stored_per_glwe);
+    // lwe_per_glwe.
+    max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);

    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(
-            (size_t)num_radix_blocks,
-            (size_t)(compression_params.small_lwe_dimension + 1)),
+        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
+            sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<Torus>((size_t)max_num_glwes,
-                                   glwe_accumulator_size),
+            max_num_glwes * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

@@ -53,21 +51,12 @@ template <typename Torus> struct int_compression {
  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(
        tmp_lwe, streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    tmp_lwe = nullptr;
-
    cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    tmp_glwe_array_out = nullptr;
-
-    if constexpr (sizeof(Torus) == 8)
-      cleanup_cuda_packing_keyswitch_lwe_list_to_glwe_64(
-          streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
-          gpu_memory_allocated);
-    else
-      cleanup_cuda_packing_keyswitch_lwe_list_to_glwe_128(
-          streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
-          gpu_memory_allocated);
+    cleanup_packing_keyswitch_lwe_list_to_glwe(
+        streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
+        gpu_memory_allocated);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };
@@ -100,17 +89,14 @@ template <typename Torus> struct int_decompression {
                                     1);

    tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>((size_t)num_blocks_to_decompress,
-                               glwe_accumulator_size),
+        num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);
    tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<uint32_t>((size_t)num_blocks_to_decompress),
-        streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
+        num_blocks_to_decompress * sizeof(uint32_t), streams.stream(0),
+        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>((size_t)num_blocks_to_decompress,
-                               lwe_accumulator_size),
+        num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

@@ -136,24 +122,20 @@ template <typename Torus> struct int_decompression {
          active_streams, {0}, {decompression_rescale_f},
          effective_compression_message_modulus,
          effective_compression_carry_modulus,
-          encryption_params.message_modulus, encryption_params.carry_modulus);
+          encryption_params.message_modulus, encryption_params.carry_modulus,
+          gpu_memory_allocated);
    }
  }
  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-
-    tmp_extracted_glwe = nullptr;
    cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    tmp_extracted_lwe = nullptr;
    cuda_drop_with_size_tracking_async(tmp_indexes_array, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    tmp_indexes_array = nullptr;
-
    if constexpr (std::is_same_v<Torus, uint64_t>) {
      decompression_rescale_lut->release(streams);
      delete decompression_rescale_lut;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -285,7 +285,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    for (int j = 0; j < 2; j++) {
      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
                                          {0}, {zero_out_if_not_1_lut_f},
-                                          LUT_0_FOR_ALL_BLOCKS);
+                                          gpu_memory_allocated);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -295,7 +295,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    for (int j = 0; j < 2; j++) {
      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
                                          {0}, {zero_out_if_not_2_lut_f},
-                                          LUT_0_FOR_ALL_BLOCKS);
+                                          gpu_memory_allocated);
    }

    quotient_lut_1 =
@@ -316,11 +316,11 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

    quotient_lut_1->generate_and_broadcast_lut(
-        streams.get_ith(2), {0}, {quotient_lut_1_f}, LUT_0_FOR_ALL_BLOCKS);
+        streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
    quotient_lut_2->generate_and_broadcast_lut(
-        streams.get_ith(1), {0}, {quotient_lut_2_f}, LUT_0_FOR_ALL_BLOCKS);
+        streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
    quotient_lut_3->generate_and_broadcast_lut(
-        streams.get_ith(0), {0}, {quotient_lut_3_f}, LUT_0_FOR_ALL_BLOCKS);
+        streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -340,7 +340,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {

    for (int j = 0; j < 2; j++) {
      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }
  }

@@ -433,31 +433,30 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
      Torus ***second_indexes_ptr, Torus ***scalars_ptr, uint32_t num_blocks,
      bool allocate_gpu_memory, uint64_t &size_tracker) {

-    auto first_indexes = (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
-    auto second_indexes =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
-    auto scalars = (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+    auto first_indexes = (Torus **)malloc(num_blocks * sizeof(Torus *));
+    auto second_indexes = (Torus **)malloc(num_blocks * sizeof(Torus *));
+    auto scalars = (Torus **)malloc(num_blocks * sizeof(Torus *));

    for (int nb = 1; nb <= num_blocks; nb++) {
      first_indexes[nb - 1] = (Torus *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<Torus>(nb), stream, gpu_index, size_tracker,
+          nb * sizeof(Torus), stream, gpu_index, size_tracker,
          allocate_gpu_memory);
      second_indexes[nb - 1] = (Torus *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<Torus>(nb), stream, gpu_index, size_tracker,
+          nb * sizeof(Torus), stream, gpu_index, size_tracker,
          allocate_gpu_memory);
      scalars[nb - 1] = (Torus *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<Torus>(nb), stream, gpu_index, size_tracker,
+          nb * sizeof(Torus), stream, gpu_index, size_tracker,
          allocate_gpu_memory);

      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          first_indexes[nb - 1], first_indexes_for_overflow_sub_gpu_0[nb - 1],
-          safe_mul_sizeof<Torus>(nb), stream, gpu_index, allocate_gpu_memory);
+          nb * sizeof(Torus), stream, gpu_index, allocate_gpu_memory);
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          second_indexes[nb - 1], second_indexes_for_overflow_sub_gpu_0[nb - 1],
-          safe_mul_sizeof<Torus>(nb), stream, gpu_index, allocate_gpu_memory);
+          nb * sizeof(Torus), stream, gpu_index, allocate_gpu_memory);
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          scalars[nb - 1], scalars_for_overflow_sub_gpu_0[nb - 1],
-          safe_mul_sizeof<Torus>(nb), stream, gpu_index, allocate_gpu_memory);
+          nb * sizeof(Torus), stream, gpu_index, allocate_gpu_memory);
      *first_indexes_ptr = first_indexes;
      *second_indexes_ptr = second_indexes;
      *scalars_ptr = scalars;
@@ -471,91 +470,72 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    max_indexes_to_erase = num_blocks;

    first_indexes_for_overflow_sub_gpu_0 =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+        (Torus **)malloc(num_blocks * sizeof(Torus *));
    second_indexes_for_overflow_sub_gpu_0 =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+        (Torus **)malloc(num_blocks * sizeof(Torus *));
    scalars_for_overflow_sub_gpu_0 =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+        (Torus **)malloc(num_blocks * sizeof(Torus *));

-    Torus *h_lut_indexes = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));
-    Torus *h_scalar = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));
+    Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
+    Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));

    // Extra indexes for the luts in first step
    for (int nb = 1; nb <= num_blocks; nb++) {
      first_indexes_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              safe_mul_sizeof<Torus>(nb), streams.stream(0),
-              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
-
-      auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
-        for (int index = 0; index < nb; index++) {
-          uint32_t grouping_index = index / group_size;
-          bool is_in_first_grouping = (grouping_index == 0);
-          uint32_t index_in_grouping = index % group_size;
-          bool is_last_index = (index == (nb - 1));
-          if (is_last_index) {
-            if (nb == 1) {
-              h_lut_indexes[index] = 2 * group_size;
-            } else {
-              h_lut_indexes[index] = 2;
-            }
-          } else if (is_in_first_grouping) {
-            h_lut_indexes[index] = index_in_grouping;
+              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+              size_tracker, allocate_gpu_memory);
+      for (int index = 0; index < nb; index++) {
+        uint32_t grouping_index = index / group_size;
+        bool is_in_first_grouping = (grouping_index == 0);
+        uint32_t index_in_grouping = index % group_size;
+        bool is_last_index = (index == (nb - 1));
+        if (is_last_index) {
+          if (nb == 1) {
+            h_lut_indexes[index] = 2 * group_size;
          } else {
-            h_lut_indexes[index] = index_in_grouping + group_size;
+            h_lut_indexes[index] = 2;
          }
+        } else if (is_in_first_grouping) {
+          h_lut_indexes[index] = index_in_grouping;
+        } else {
+          h_lut_indexes[index] = index_in_grouping + group_size;
        }
-      };
-
-      generate_lut_indexes<Torus>(streams, index_generator,
-                                  first_indexes_for_overflow_sub_gpu_0[nb - 1],
-                                  nb, 2 * group_size + 1, h_lut_indexes,
-                                  allocate_gpu_memory);
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          first_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
+          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+          allocate_gpu_memory);
    }
    // Extra indexes for the luts in second step
-    uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
-    uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
    for (int nb = 1; nb <= num_blocks; nb++) {
      second_indexes_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              safe_mul_sizeof<Torus>(nb), streams.stream(0),
-              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+              size_tracker, allocate_gpu_memory);
      scalars_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              safe_mul_sizeof<Torus>(nb), streams.stream(0),
-              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
-
-      auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
-                                                       uint32_t) {
-        for (int index = 0; index < nb; index++) {
-          uint32_t grouping_index = index / group_size;
-          bool is_in_first_grouping = (grouping_index == 0);
-          uint32_t index_in_grouping = index % group_size;
-
-          if (is_in_first_grouping) {
-            h_lut_indexes[index] = index_in_grouping;
-          } else if (index_in_grouping == (group_size - 1)) {
-            if (use_seq) {
-              int inner_index = (grouping_index - 1) % (group_size - 1);
-              h_lut_indexes[index] = inner_index + 2 * group_size;
-            } else {
-              h_lut_indexes[index] = 2 * group_size;
-            }
-          } else {
-            h_lut_indexes[index] = index_in_grouping + group_size;
-          }
-        }
-      };
-
-      generate_lut_indexes<Torus>(streams, index_generator,
-                                  second_indexes_for_overflow_sub_gpu_0[nb - 1],
-                                  nb, num_luts_second_step, h_lut_indexes,
-                                  allocate_gpu_memory);
+              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+              size_tracker, allocate_gpu_memory);

      for (int index = 0; index < nb; index++) {
        uint32_t grouping_index = index / group_size;
        bool is_in_first_grouping = (grouping_index == 0);
        uint32_t index_in_grouping = index % group_size;
+
+        if (is_in_first_grouping) {
+          h_lut_indexes[index] = index_in_grouping;
+        } else if (index_in_grouping == (group_size - 1)) {
+          if (use_seq) {
+            int inner_index = (grouping_index - 1) % (group_size - 1);
+            h_lut_indexes[index] = inner_index + 2 * group_size;
+          } else {
+            h_lut_indexes[index] = 2 * group_size;
+          }
+        } else {
+          h_lut_indexes[index] = index_in_grouping + group_size;
+        }
+
        bool may_have_its_padding_bit_set =
            !is_in_first_grouping && (index_in_grouping == group_size - 1);

@@ -570,9 +550,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
        }
      }
      cuda_memcpy_with_size_tracking_async_to_gpu(
-          scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar,
-          safe_mul_sizeof<Torus>(nb), streams.stream(0), streams.gpu_index(0),
+          second_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
+          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
          allocate_gpu_memory);
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar, nb * sizeof(Torus),
+          streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    }
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_lut_indexes);
@@ -1008,12 +991,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
      masking_luts_1[i]->generate_and_broadcast_lut(
-          active_streams_1, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);

      auto active_streams_2 =
          streams.active_gpu_subset(num_blocks, params.pbs_type);
      masking_luts_2[i]->generate_and_broadcast_lut(
-          active_streams_2, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1036,7 +1019,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
        streams.active_gpu_subset(num_blocks, params.pbs_type);
    for (int j = 0; j < 2; j++) {
      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }

    // Give name to closures to improve readability
@@ -1064,11 +1047,11 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    zero_out_if_overflow_did_not_happen[0]
        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
-                                               LUT_0_FOR_ALL_BLOCKS, {},
+                                               gpu_memory_allocated, {},
                                               params.message_modulus - 2);
    zero_out_if_overflow_did_not_happen[1]
        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
-                                               LUT_0_FOR_ALL_BLOCKS, {},
+                                               gpu_memory_allocated, {},
                                               params.message_modulus - 1);

    // create and generate zero_out_if_overflow_happened
@@ -1087,10 +1070,10 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
    };

    zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
+        active_streams, {0}, {overflow_happened_f}, gpu_memory_allocated, {},
        params.message_modulus - 2);
    zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
+        active_streams, {0}, {overflow_happened_f}, gpu_memory_allocated, {},
        params.message_modulus - 1);

    // merge_overflow_flags_luts
@@ -1106,7 +1089,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_for_bits, {0}, {lut_f_bit}, LUT_0_FOR_ALL_BLOCKS);
+          active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
    }
  }

@@ -1164,89 +1147,71 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
    max_indexes_to_erase = num_blocks;

    first_indexes_for_overflow_sub =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+        (Torus **)malloc(num_blocks * sizeof(Torus *));
    second_indexes_for_overflow_sub =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
-    scalars_for_overflow_sub =
-        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+        (Torus **)malloc(num_blocks * sizeof(Torus *));
+    scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));

-    Torus *h_lut_indexes = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));
-    Torus *h_scalar = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));
+    Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
+    Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));

    // Extra indexes for the luts in first step
    for (int nb = 1; nb <= num_blocks; nb++) {
      first_indexes_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              safe_mul_sizeof<Torus>(nb), streams.stream(0),
-              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
-
-      auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
-        for (int index = 0; index < nb; index++) {
-          uint32_t grouping_index = index / group_size;
-          bool is_in_first_grouping = (grouping_index == 0);
-          uint32_t index_in_grouping = index % group_size;
-          bool is_last_index = (index == (nb - 1));
-          if (is_last_index) {
-            if (nb == 1) {
-              h_lut_indexes[index] = 2 * group_size;
-            } else {
-              h_lut_indexes[index] = 2;
-            }
-          } else if (is_in_first_grouping) {
-            h_lut_indexes[index] = index_in_grouping;
+              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+              size_tracker, allocate_gpu_memory);
+      for (int index = 0; index < nb; index++) {
+        uint32_t grouping_index = index / group_size;
+        bool is_in_first_grouping = (grouping_index == 0);
+        uint32_t index_in_grouping = index % group_size;
+        bool is_last_index = (index == (nb - 1));
+        if (is_last_index) {
+          if (nb == 1) {
+            h_lut_indexes[index] = 2 * group_size;
          } else {
-            h_lut_indexes[index] = index_in_grouping + group_size;
+            h_lut_indexes[index] = 2;
          }
+        } else if (is_in_first_grouping) {
+          h_lut_indexes[index] = index_in_grouping;
+        } else {
+          h_lut_indexes[index] = index_in_grouping + group_size;
        }
-      };
-
-      generate_lut_indexes<Torus>(
-          streams, index_generator, first_indexes_for_overflow_sub[nb - 1], nb,
-          2 * group_size + 1, h_lut_indexes, allocate_gpu_memory);
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
+          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+          allocate_gpu_memory);
    }
    // Extra indexes for the luts in second step
-    uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
-    uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
    for (int nb = 1; nb <= num_blocks; nb++) {
      second_indexes_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              safe_mul_sizeof<Torus>(nb), streams.stream(0),
-              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+              size_tracker, allocate_gpu_memory);
      scalars_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              safe_mul_sizeof<Torus>(nb), streams.stream(0),
-              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
-
-      auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
-                                                       uint32_t) {
-        for (int index = 0; index < nb; index++) {
-          uint32_t grouping_index = index / group_size;
-          bool is_in_first_grouping = (grouping_index == 0);
-          uint32_t index_in_grouping = index % group_size;
-
-          if (is_in_first_grouping) {
-            h_lut_indexes[index] = index_in_grouping;
-          } else if (index_in_grouping == (group_size - 1)) {
-            if (use_seq) {
-              int inner_index = (grouping_index - 1) % (group_size - 1);
-              h_lut_indexes[index] = inner_index + 2 * group_size;
-            } else {
-              h_lut_indexes[index] = 2 * group_size;
-            }
-          } else {
-            h_lut_indexes[index] = index_in_grouping + group_size;
-          }
-        }
-      };
-
-      generate_lut_indexes<Torus>(
-          streams, index_generator, second_indexes_for_overflow_sub[nb - 1], nb,
-          num_luts_second_step, h_lut_indexes, allocate_gpu_memory);
+              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+              size_tracker, allocate_gpu_memory);

      for (int index = 0; index < nb; index++) {
        uint32_t grouping_index = index / group_size;
        bool is_in_first_grouping = (grouping_index == 0);
        uint32_t index_in_grouping = index % group_size;
+
+        if (is_in_first_grouping) {
+          h_lut_indexes[index] = index_in_grouping;
+        } else if (index_in_grouping == (group_size - 1)) {
+          if (use_seq) {
+            int inner_index = (grouping_index - 1) % (group_size - 1);
+            h_lut_indexes[index] = inner_index + 2 * group_size;
+          } else {
+            h_lut_indexes[index] = 2 * group_size;
+          }
+        } else {
+          h_lut_indexes[index] = index_in_grouping + group_size;
+        }
+
        bool may_have_its_padding_bit_set =
            !is_in_first_grouping && (index_in_grouping == group_size - 1);

@@ -1261,9 +1226,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
        }
      }
      cuda_memcpy_with_size_tracking_async_to_gpu(
-          scalars_for_overflow_sub[nb - 1], h_scalar,
-          safe_mul_sizeof<Torus>(nb), streams.stream(0), streams.gpu_index(0),
+          second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
+          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
          allocate_gpu_memory);
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
+          streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    }
    free(h_lut_indexes);
    free(h_scalar);
@@ -1535,7 +1503,7 @@ template <typename Torus> struct int_div_rem_memory {

      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
-          LUT_0_FOR_ALL_BLOCKS);
+          gpu_memory_allocated);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -54,7 +54,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
    };

    univ_lut_mem->generate_and_broadcast_lut(
-        active_streams, {0}, {generate_uni_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -66,7 +66,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
    };

    biv_lut_mem->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {generate_bi_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -234,7 +234,7 @@ template <typename Torus> struct int_ilog2_buffer {
    auto active_streams =
        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
    lut_message_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_message_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -245,7 +245,7 @@ template <typename Torus> struct int_ilog2_buffer {
      return (~carry) % this->params.message_modulus;
    };
    lut_carry_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -42,7 +42,7 @@ template <typename Torus> struct int_mul_memory {
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
          active_streams, {0}, {zero_out_predicate_lut_f},
-          LUT_0_FOR_ALL_BLOCKS);
+          gpu_memory_allocated);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -62,10 +62,6 @@ template <typename Torus> struct int_mul_memory {

    int total_block_count = num_radix_blocks * num_radix_blocks;

-    GPU_ASSERT(lsb_vector_block_count + msb_vector_block_count ==
-                   total_block_count,
-               "MSB and LSB vector block counts don't match");
-
    // allocate memory for intermediate buffers
    vector_result_sb = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -100,16 +96,16 @@ template <typename Torus> struct int_mul_memory {
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
    // for message and carry default lut_indexes_vec is fine
+    if (allocate_gpu_memory)
+      cuda_set_value_async<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
+          msb_vector_block_count);
+
    auto active_streams =
        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    auto lut_index_generator = [lsb_vector_block_count](Torus *h_lut_indexes,
-                                                        uint32_t num_indexes) {
-      for (uint32_t i = 0; i < num_indexes; i++) {
-        h_lut_indexes[i] = (i < lsb_vector_block_count) ? 0 : 1;
-      }
-    };
    luts_array->generate_and_broadcast_bivariate_lut(
-        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, lut_index_generator);
+        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);

    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -86,10 +86,9 @@ template <typename Torus> struct int_grouped_oprf_memory {
    // (handling both bounded and unbounded cases), which pre-computed LUT to
    // use, and the final plaintext correction to add.
    //
-    Torus *h_corrections = (Torus *)calloc(
-        1, safe_mul_sizeof<Torus>(num_blocks_to_process, lwe_size));
-    this->h_lut_indexes =
-        (Torus *)calloc(1, safe_mul_sizeof<Torus>(num_blocks_to_process));
+    Torus *h_corrections =
+        (Torus *)calloc(num_blocks_to_process * lwe_size, sizeof(Torus));
+    this->h_lut_indexes = (Torus *)calloc(num_blocks_to_process, sizeof(Torus));

    uint64_t bits_processed = 0;
    for (uint32_t i = 0; i < num_blocks_to_process; ++i) {
@@ -105,6 +104,10 @@ template <typename Torus> struct int_grouped_oprf_memory {
      Torus plaintext_to_add = (p - 1) * delta / 2;

      h_corrections[i * lwe_size + params.big_lwe_dimension] = plaintext_to_add;
+      if (bits_for_this_block < 1) {
+        PANIC("bits_for_this_block should be greater than 1");
+      }
+      this->h_lut_indexes[i] = bits_for_this_block - 1;

      bits_processed += bits_for_this_block;
    }
@@ -116,35 +119,22 @@ template <typename Torus> struct int_grouped_oprf_memory {
    // Copy the prepared plaintext corrections to the GPU.
    cuda_memcpy_with_size_tracking_async_to_gpu(
        this->plaintext_corrections->ptr, h_corrections,
-        safe_mul_sizeof<Torus>(num_blocks_to_process, lwe_size),
-        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
+        num_blocks_to_process * lwe_size * sizeof(Torus), streams.stream(0),
+        streams.gpu_index(0), allocate_gpu_memory);

    // Copy the prepared LUT indexes to the GPU 0, before broadcast to all other
    // GPUs.
+    cuda_memcpy_with_size_tracking_async_to_gpu(
+        luts->get_lut_indexes(0, 0), this->h_lut_indexes,
+        num_blocks_to_process * sizeof(Torus), streams.stream(0),
+        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
+
    // No encoding for these LUTS. Generate LUT also sets LUT degrees to default
    // values
-    auto luts_index_generator = [total_random_bits, message_bits_per_block](
-                                    Torus *h_lut_indexes, uint32_t num_blocks) {
-      uint64_t bits_processed = 0;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        if (total_random_bits <= bits_processed) {
-          PANIC("total_random_bits should be greater than bits_processed");
-        }
-        uint64_t bits_remaining = total_random_bits - bits_processed;
-        uint32_t bits_for_this_block =
-            std::min((uint64_t)message_bits_per_block, bits_remaining);
-        if (bits_for_this_block < 1) {
-          PANIC("bits_for_this_block should be greater than 1");
-        }
-        h_lut_indexes[i] = bits_for_this_block - 1;
-        bits_processed += bits_for_this_block;
-      }
-    };
    luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
-                                     luts_index_generator, false, {},
-                                     this->h_lut_indexes);
+                                     allocate_gpu_memory, false);

    // OPRF requires custom LUT degrees
    for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/rerand.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/rerand.h
@@ -3,16 +3,17 @@
 #include "integer.h"

 extern "C" {
-uint64_t scratch_cuda_rerand_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, bool allocate_gpu_memory);
+uint64_t
+scratch_cuda_rerand_64(CudaStreamsFFI streams, int8_t **mem_ptr,
+                       uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
+                       uint32_t ks_level, uint32_t ks_base_log,
+                       uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+                       uint32_t carry_modulus, bool allocate_gpu_memory);

-void cuda_rerand_64_async(
+void cuda_rerand_64(
    CudaStreamsFFI streams, void *lwe_array,
    const void *lwe_flattened_encryptions_of_zero_compact_array_in,
    int8_t *mem_ptr, void *const *ksk);

-void cleanup_cuda_rerand_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_rerand(CudaStreamsFFI streams, int8_t **mem_ptr_void);
 }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h
@@ -1,6 +1,5 @@
 #pragma once

-#include "checked_arithmetic.h"
 #include "integer_utilities.h"
 #include "keyswitch/ks_enums.h"
 #include "zk/expand.cuh"
@@ -30,34 +29,34 @@ template <typename Torus> struct int_rerand_mem {
        gpu_memory_allocated(allocate_gpu_memory) {

    tmp_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, params.big_lwe_dimension + 1),
+        num_lwes * (params.big_lwe_dimension + 1) * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    tmp_ksed_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, params.small_lwe_dimension + 1),
+        num_lwes * (params.small_lwe_dimension + 1) * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
+            num_lwes * sizeof(expand_job<Torus>), streams.stream(0),
            streams.gpu_index(0), size_tracker, allocate_gpu_memory));

    h_expand_jobs = static_cast<expand_job<Torus> *>(
-        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
+        malloc(num_lwes * sizeof(expand_job<Torus>)));

    auto h_lwe_trivial_indexes =
-        static_cast<Torus *>(malloc(safe_mul_sizeof<Torus>(num_lwes)));
+        static_cast<Torus *>(malloc(num_lwes * sizeof(Torus)));
    for (auto i = 0; i < num_lwes; ++i) {
      h_lwe_trivial_indexes[i] = i;
    }
    lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes), streams.stream(0),
-        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+        num_lwes * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+        size_tracker, allocate_gpu_memory);
    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_trivial_indexes,
-                             safe_mul_sizeof<Torus>(num_lwes),
-                             streams.stream(0), streams.gpu_index(0));
+                             num_lwes * sizeof(Torus), streams.stream(0),
+                             streams.gpu_index(0));

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

@@ -68,19 +67,15 @@ template <typename Torus> struct int_rerand_mem {
    cuda_drop_with_size_tracking_async(tmp_zero_lwes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    tmp_zero_lwes = nullptr;
    cuda_drop_with_size_tracking_async(tmp_ksed_zero_lwes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    tmp_ksed_zero_lwes = nullptr;
    cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    lwe_trivial_indexes = nullptr;
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    d_expand_jobs = nullptr;

    for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
      cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
@@ -90,6 +85,5 @@ template <typename Torus> struct int_rerand_mem {

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_expand_jobs);
-    h_expand_jobs = nullptr;
  }
 };
--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -89,7 +89,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -171,7 +171,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -265,7 +265,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
          streams.active_gpu_subset(1, params.pbs_type);
      shift_last_block_lut_univariate->generate_and_broadcast_lut(
          active_streams_shift_last, {0}, {last_block_lut_f},
-          LUT_0_FOR_ALL_BLOCKS);
+          gpu_memory_allocated);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -284,7 +284,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
    };

    padding_block_lut_univariate->generate_and_broadcast_lut(
-        active_streams, {0}, {padding_block_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -321,7 +321,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
          active_streams_shift_blocks, {0}, {blocks_lut_f},
-          LUT_0_FOR_ALL_BLOCKS);
+          gpu_memory_allocated);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -117,7 +117,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
        bits_per_block * num_radix_blocks, params.pbs_type);

    mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
-                                        LUT_0_FOR_ALL_BLOCKS);
+                                        gpu_memory_allocated);

    auto cleaning_lut_f = [params](Torus x) -> Torus {
      return x % params.message_modulus;
@@ -126,7 +126,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
    auto active_gpu_count_cleaning =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    cleaning_lut->generate_and_broadcast_lut(
-        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, LUT_0_FOR_ALL_BLOCKS);
+        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
@@ -22,22 +22,21 @@ template <typename Torus> struct int_overflowing_sub_memory {
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
    auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
-    auto total_size_bytes =
-        safe_mul_sizeof<Torus>((size_t)num_radix_blocks, big_lwe_size);
+    auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

    // allocate memory for intermediate calculations
    generates_or_propagates = (Torus *)cuda_malloc_with_size_tracking_async(
-        total_size_bytes, streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
+        num_radix_blocks * big_lwe_size_bytes, streams.stream(0),
+        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    step_output = (Torus *)cuda_malloc_with_size_tracking_async(
-        total_size_bytes, streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
+        num_radix_blocks * big_lwe_size_bytes, streams.stream(0),
+        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    cuda_memset_with_size_tracking_async(
-        generates_or_propagates, 0, total_size_bytes, streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
+        generates_or_propagates, 0, num_radix_blocks * big_lwe_size_bytes,
+        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    cuda_memset_with_size_tracking_async(
-        step_output, 0, total_size_bytes, streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
+        step_output, 0, num_radix_blocks * big_lwe_size_bytes,
+        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);

    // declare functions for lut generation
    auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -75,27 +74,26 @@ template <typename Torus> struct int_overflowing_sub_memory {
                                           luts_array, size_tracker,
                                           allocate_gpu_memory, size_tracker);

+    if (allocate_gpu_memory)
+      cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                  luts_array->get_lut_indexes(0, 1), 1,
+                                  num_radix_blocks - 1);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
        active_streams, {0}, {f_luts_borrow_propagation_sum},
-        LUT_0_FOR_ALL_BLOCKS);
+        gpu_memory_allocated);

-    auto luts_array_index_generator = [](Torus *h_lut_indexes,
-                                         uint32_t num_indexes) {
-      for (uint32_t i = 0; i < num_indexes; i++) {
-        h_lut_indexes[i] = (i == 0) ? 0 : 1;
-      }
-    };
    luts_array->generate_and_broadcast_lut(
        active_streams, {0, 1},
        {f_lut_does_block_generate_carry,
         f_lut_does_block_generate_or_propagate},
-        luts_array_index_generator);
+        gpu_memory_allocated);
    // generate luts (aka accumulators)

    message_acc->generate_and_broadcast_lut(
-        active_streams, {0}, {f_message_acc}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -62,7 +62,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
    }

    this->comparison_luts->generate_and_broadcast_many_lut(
-        active_streams, {0}, {fns}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams, {0}, {fns}, allocate_gpu_memory);
    fns.clear();

    this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
@@ -196,7 +196,7 @@ template <typename Torus> struct int_possible_results_buffer {

        current_lut->generate_and_broadcast_many_lut(
            streams.active_gpu_subset(1, params.pbs_type), {0}, {fns},
-            LUT_0_FOR_ALL_BLOCKS);
+            allocate_gpu_memory);

        stream_luts[lut_count++] = current_lut;
        lut_value_start += luts_in_this_call;
@@ -287,7 +287,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

      lut->generate_and_broadcast_lut(
          streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
-          LUT_0_FOR_ALL_BLOCKS);
+          allocate_gpu_memory);

      this->stream_identity_luts[i] = lut;
    }
@@ -304,14 +304,14 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->message_extract_lut->generate_and_broadcast_lut(
        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
-        LUT_0_FOR_ALL_BLOCKS);
+        allocate_gpu_memory);

    this->carry_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

    this->carry_extract_lut->generate_and_broadcast_lut(
        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
-        LUT_0_FOR_ALL_BLOCKS);
+        allocate_gpu_memory);

    this->partial_aggregated_vectors =
        new CudaRadixCiphertextFFI *[num_streams];
@@ -534,7 +534,7 @@ template <typename Torus> struct int_unchecked_match_value_or_buffer {
    this->tmp_or_value = new CudaRadixCiphertextFFI;

    this->d_or_value = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_final_blocks), streams.stream(0),
+        num_final_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), size_tracker, allocate_gpu_memory);

    create_zero_radix_ciphertext_async<Torus>(
@@ -712,8 +712,8 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
        allocate_gpu_memory);

    this->d_clear_val = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_blocks), streams.stream(0),
-        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+        num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+        size_tracker, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
@@ -866,7 +866,7 @@ template <typename Torus> struct int_final_index_from_selectors_buffer {
    uint32_t num_bits_in_message = log2_int(params.message_modulus);
    uint32_t bits_per_packed_block = 2 * num_bits_in_message;

-    h_indices = new uint64_t[safe_mul((size_t)num_inputs, (size_t)packed_len)];
+    h_indices = new uint64_t[num_inputs * packed_len];
    for (uint32_t i = 0; i < num_inputs; i++) {
      uint64_t val = i;
      for (uint32_t b = 0; b < packed_len; b++) {
@@ -1128,16 +1128,15 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
        allocate_gpu_memory);

    this->d_clear_val = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_blocks), streams.stream(0),
-        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+        num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+        size_tracker, allocate_gpu_memory);

    h_indices = nullptr;
    if (allocate_gpu_memory) {
      uint32_t num_bits_in_message = log2_int(params.message_modulus);
      uint32_t bits_per_packed_block = 2 * num_bits_in_message;

-      h_indices =
-          new uint64_t[safe_mul((size_t)num_inputs, (size_t)packed_len)];
+      h_indices = new uint64_t[num_inputs * packed_len];
      for (uint32_t i = 0; i < num_inputs; i++) {
        uint64_t val = i;
        for (uint32_t b = 0; b < packed_len; b++) {
@@ -1157,11 +1156,11 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
      return current;
    };
    this->prefix_sum_lut = new int_radix_lut<Torus>(
-        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
+        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);
+        {prefix_sum_fn}, allocate_gpu_memory);

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1173,7 +1172,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
    this->cleanup_lut->generate_and_broadcast_lut(
        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
+        {cleanup_fn}, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
@@ -1317,8 +1316,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
      uint32_t num_bits_in_message = log2_int(params.message_modulus);
      uint32_t bits_per_packed_block = 2 * num_bits_in_message;

-      h_indices =
-          new uint64_t[safe_mul((size_t)num_inputs, (size_t)packed_len)];
+      h_indices = new uint64_t[num_inputs * packed_len];
      for (uint32_t i = 0; i < num_inputs; i++) {
        uint64_t val = i;
        for (uint32_t b = 0; b < packed_len; b++) {
@@ -1338,11 +1336,11 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
      return current;
    };
    this->prefix_sum_lut = new int_radix_lut<Torus>(
-        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
+        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);
+        {prefix_sum_fn}, allocate_gpu_memory);

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1354,7 +1352,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
    this->cleanup_lut->generate_and_broadcast_lut(
        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
+        {cleanup_fn}, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -5,83 +5,73 @@

 extern "C" {

-void cuda_keyswitch_lwe_ciphertext_vector_64_64_async(
+void cuda_keyswitch_lwe_ciphertext_vector_64_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

-void cuda_keyswitch_lwe_ciphertext_vector_64_32_async(
+void cuda_keyswitch_lwe_ciphertext_vector_64_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

-uint64_t scratch_cuda_packing_keyswitch_lwe_list_to_glwe_64_async(
+uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory);

-void cuda_keyswitch_gemm_64_64_async(
+void cuda_keyswitch_gemm_lwe_ciphertext_vector_64_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);

-void cuda_keyswitch_gemm_64_32_async(
+void cuda_keyswitch_gemm_lwe_ciphertext_vector_64_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);

-uint64_t scratch_cuda_keyswitch_gemm_64_64_async(
-    void *stream, uint32_t gpu_index, void **ks_tmp_memory,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t num_lwes,
-    bool allocate_gpu_memory);
+uint64_t scratch_cuda_keyswitch_gemm_64(void *stream, uint32_t gpu_index,
+                                        void **ks_tmp_memory,
+                                        uint32_t lwe_dimension_in,
+                                        uint32_t lwe_dimension_out,
+                                        uint32_t num_lwes,
+                                        bool allocate_gpu_memory);

-uint64_t scratch_cuda_keyswitch_gemm_64_32_async(
-    void *stream, uint32_t gpu_index, void **ks_tmp_memory,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t num_lwes,
-    bool allocate_gpu_memory);
+void cleanup_cuda_keyswitch_gemm_64(void *stream, uint32_t gpu_index,
+                                    void **ks_tmp_memory,
+                                    bool allocate_gpu_memory);

-void cleanup_cuda_keyswitch_gemm_64_64(void *stream, uint32_t gpu_index,
-                                       void **ks_tmp_memory,
-                                       bool allocate_gpu_memory);
-
-void cleanup_cuda_keyswitch_gemm_64_32(void *stream, uint32_t gpu_index,
-                                       void **ks_tmp_memory,
-                                       bool allocate_gpu_memory);
-
-void cuda_packing_keyswitch_lwe_list_to_glwe_64_async(
+void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes);

-void scratch_cuda_packing_keyswitch_lwe_list_to_glwe_128_async(
+void scratch_packing_keyswitch_lwe_list_to_glwe_128(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory);

-void cuda_packing_keyswitch_lwe_list_to_glwe_128_async(
+void cuda_packing_keyswitch_lwe_list_to_glwe_128(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes);

-void cleanup_cuda_packing_keyswitch_lwe_list_to_glwe_64(
-    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    bool gpu_memory_allocated);
-
-void cleanup_cuda_packing_keyswitch_lwe_list_to_glwe_128(
-    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    bool gpu_memory_allocated);
+void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
+                                                uint32_t gpu_index,
+                                                int8_t **fp_ks_buffer,
+                                                bool gpu_memory_allocated);

 void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
                                         void const *input, void *output,
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
@@ -4,7 +4,7 @@
 #include "../integer/integer.h"

 extern "C" {
-uint64_t scratch_cuda_kreyvium_generate_keystream_64_async(
+uint64_t scratch_cuda_kreyvium_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -12,14 +12,13 @@ uint64_t scratch_cuda_kreyvium_generate_keystream_64_async(
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);

-void cuda_kreyvium_generate_keystream_64_async(
+void cuda_kreyvium_generate_keystream_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks);

-void cleanup_cuda_kreyvium_generate_keystream_64(CudaStreamsFFI streams,
-                                                 int8_t **mem_ptr_void);
+void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -43,19 +43,20 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);

-void scratch_cuda_wrapping_polynomial_mul_one_to_many_64_async(
-    void *stream, uint32_t gpu_index, uint32_t polynomial_size,
-    int8_t **circulant_buf);
+void scratch_wrapping_polynomial_mul_one_to_many_64(void *stream,
+                                                    uint32_t gpu_index,
+                                                    uint32_t polynomial_size,
+                                                    int8_t **circulant_buf);

-void cleanup_cuda_wrapping_polynomial_mul_one_to_many_64(void *stream,
-                                                         uint32_t gpu_index,
-                                                         int8_t *circulant_buf);
+void cleanup_wrapping_polynomial_mul_one_to_many_64(void *stream,
+                                                    uint32_t gpu_index,
+                                                    int8_t *circulant_buf);

-void cuda_wrapping_polynomial_mul_one_to_many_64_async(
+void cuda_wrapping_polynomial_mul_one_to_many_64(
    void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
    int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
    uint32_t n_rhs);
-void cuda_glwe_wrapping_polynomial_mul_one_to_many_64_async(
+void cuda_glwe_wrapping_polynomial_mul_one_to_many_64(
    void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
    int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
    uint32_t glwe_dimension, uint32_t n_rhs);
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -1,7 +1,6 @@
 #ifndef CUDA_MULTI_BIT_UTILITIES_H
 #define CUDA_MULTI_BIT_UTILITIES_H

-#include "checked_arithmetic.h"
 #include "pbs_utilities.h"

 template <typename Torus>
@@ -106,8 +105,7 @@ uint64_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
                                uint32_t polynomial_size,
                                uint32_t glwe_dimension, uint32_t level_count,
                                uint64_t full_sm_keybundle);
-template <typename Torus>
-struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
+template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
@@ -154,18 +152,15 @@ struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
            polynomial_size);

-    size_t num_blocks_keybundle = safe_mul(
-        (size_t)input_lwe_ciphertext_count, (size_t)lwe_chunk_size,
-        safe_mul((size_t)(glwe_dimension + 1), (size_t)(glwe_dimension + 1)),
-        (size_t)level_count);
-    size_t num_blocks_acc_step_one =
-        safe_mul((size_t)level_count, (size_t)(glwe_dimension + 1),
-                 (size_t)input_lwe_ciphertext_count);
-    size_t num_blocks_acc_step_two = safe_mul(
-        (size_t)input_lwe_ciphertext_count, (size_t)(glwe_dimension + 1));
-    size_t num_blocks_acc_cg =
-        safe_mul((size_t)level_count, (size_t)(glwe_dimension + 1),
-                 (size_t)input_lwe_ciphertext_count);
+    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
+                                (glwe_dimension + 1) * (glwe_dimension + 1) *
+                                level_count;
+    auto num_blocks_acc_step_one =
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
+    auto num_blocks_acc_step_two =
+        input_lwe_ciphertext_count * (glwe_dimension + 1);
+    auto num_blocks_acc_cg =
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;

 #if CUDA_ARCH >= 900
    uint64_t full_sm_tbc_accumulate =
@@ -177,13 +172,13 @@ struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
    uint64_t minimum_sm_tbc =
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
-    size_t num_blocks_acc_tbc = num_blocks_acc_cg;
+    auto num_blocks_acc_tbc = num_blocks_acc_cg;
 #endif

    // Keybundle
    if (max_shared_memory < full_sm_keybundle)
      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
-          safe_mul(num_blocks_keybundle, full_sm_keybundle), stream, gpu_index,
+          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
          size_tracker, allocate_gpu_memory);

    switch (pbs_variant) {
@@ -191,29 +186,29 @@ struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
      // Accumulator CG
      if (max_shared_memory < partial_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_cg, full_sm_cg_accumulate), stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_cg, partial_sm_cg_accumulate), stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      break;
    case PBS_VARIANT::DEFAULT:
      // Accumulator step one
      if (max_shared_memory < partial_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_step_one, full_sm_accumulate_step_one),
-            stream, gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_step_one, partial_sm_accumulate_step_one),
-            stream, gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);

      // Accumulator step two
      if (max_shared_memory < full_sm_accumulate_step_two)
        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_step_two, full_sm_accumulate_step_two),
-            stream, gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
      break;
 #if CUDA_ARCH >= 900
    case TBC:
@@ -230,12 +225,12 @@ struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
      // Accumulator TBC
      if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
        d_mem_acc_tbc = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_tbc, full_sm_tbc_accumulate), stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
        d_mem_acc_tbc = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_tbc, partial_sm_tbc_accumulate), stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      break;
 #endif
    default:
@@ -243,22 +238,19 @@ struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
    }

    keybundle_fft = (double2 *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<double2>(num_blocks_keybundle,
-                                 (size_t)(polynomial_size / 2)),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
+        num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2), stream,
+        gpu_index, size_tracker, allocate_gpu_memory);
    global_accumulator = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>((size_t)input_lwe_ciphertext_count,
-                               (size_t)(glwe_dimension + 1),
-                               (size_t)polynomial_size),
+        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+            sizeof(Torus),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<double2>(
-            safe_mul((size_t)level_count, (size_t)(glwe_dimension + 1)),
-            (size_t)input_lwe_ciphertext_count, (size_t)(polynomial_size / 2)),
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+            (polynomial_size / 2) * sizeof(double2),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
  }

-  void release(cudaStream_t stream, uint32_t gpu_index) override {
+  void release(cudaStream_t stream, uint32_t gpu_index) {

    if (d_mem_keybundle)
      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
@@ -294,14 +286,11 @@ struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> : public pbs_buffer_base {
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
                                       gpu_memory_allocated);
-
-    cuda_synchronize_stream(stream, gpu_index);
  }
 };

 template <typename InputTorus>
-struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT>
-    : public pbs_buffer_base {
+struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
@@ -348,23 +337,20 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT>
        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
            __uint128_t>(polynomial_size);

-    size_t num_blocks_keybundle = safe_mul(
-        (size_t)input_lwe_ciphertext_count, (size_t)lwe_chunk_size,
-        safe_mul((size_t)(glwe_dimension + 1), (size_t)(glwe_dimension + 1)),
-        (size_t)level_count);
-    size_t num_blocks_acc_step_one =
-        safe_mul((size_t)level_count, (size_t)(glwe_dimension + 1),
-                 (size_t)input_lwe_ciphertext_count);
-    size_t num_blocks_acc_step_two = safe_mul(
-        (size_t)input_lwe_ciphertext_count, (size_t)(glwe_dimension + 1));
-    size_t num_blocks_acc_cg =
-        safe_mul((size_t)level_count, (size_t)(glwe_dimension + 1),
-                 (size_t)input_lwe_ciphertext_count);
+    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
+                                (glwe_dimension + 1) * (glwe_dimension + 1) *
+                                level_count;
+    auto num_blocks_acc_step_one =
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
+    auto num_blocks_acc_step_two =
+        input_lwe_ciphertext_count * (glwe_dimension + 1);
+    auto num_blocks_acc_cg =
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;

    // Keybundle
    if (max_shared_memory < full_sm_keybundle)
      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
-          safe_mul(num_blocks_keybundle, full_sm_keybundle), stream, gpu_index,
+          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
          size_tracker, allocate_gpu_memory);

    switch (pbs_variant) {
@@ -372,52 +358,48 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT>
      // Accumulator CG
      if (max_shared_memory < partial_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_cg, full_sm_cg_accumulate), stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_cg, partial_sm_cg_accumulate), stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      break;
    case PBS_VARIANT::DEFAULT:
      // Accumulator step one
      if (max_shared_memory < partial_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_step_one, full_sm_accumulate_step_one),
-            stream, gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_step_one, partial_sm_accumulate_step_one),
-            stream, gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);

      // Accumulator step two
      if (max_shared_memory < full_sm_accumulate_step_two)
        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
-            safe_mul(num_blocks_acc_step_two, full_sm_accumulate_step_two),
-            stream, gpu_index, size_tracker, allocate_gpu_memory);
+            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
      break;
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<double>((size_t)num_blocks_keybundle,
-                                (size_t)(polynomial_size / 2), (size_t)4),
+        num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<__uint128_t>((size_t)input_lwe_ciphertext_count,
-                                     (size_t)(glwe_dimension + 1),
-                                     (size_t)polynomial_size),
+        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+            sizeof(__uint128_t),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<double>(
-            safe_mul((size_t)level_count, (size_t)(glwe_dimension + 1)),
-            (size_t)input_lwe_ciphertext_count,
-            safe_mul((size_t)(polynomial_size / 2), (size_t)4)),
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+            (polynomial_size / 2) * 4 * sizeof(double),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
  }

-  void release(cudaStream_t stream, uint32_t gpu_index) override {
+  void release(cudaStream_t stream, uint32_t gpu_index) {

    if (d_mem_keybundle)
      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
@@ -446,7 +428,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT>
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
                                       gpu_memory_allocated);
-    cuda_synchronize_stream(stream, gpu_index);
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -1,7 +1,6 @@
 #ifndef CUDA_BOOTSTRAP_UTILITIES_H
 #define CUDA_BOOTSTRAP_UTILITIES_H

-#include "checked_arithmetic.h"
 #include "device.h"
 #include "pbs_enums.h"
 #include "vector_types.h"
@@ -11,46 +10,45 @@ template <typename Torus>
 uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
    uint32_t polynomial_size) {
  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
-  return safe_mul_sizeof<Torus>(polynomial_size) + // accumulator_rotated
-         safe_mul_sizeof<double>(double_count,
-                                 (size_t)polynomial_size); // accumulator fft
+  return sizeof(Torus) * polynomial_size + // accumulator_rotated
+         sizeof(double) * 2 * double_count * polynomial_size /
+             2; // accumulator fft
 }
 template <typename Torus>
 uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
    uint32_t polynomial_size) {
  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
-  return safe_mul_sizeof<Torus>(polynomial_size) + // accumulator
-         safe_mul_sizeof<double>(double_count,
-                                 (size_t)polynomial_size); // accumulator fft
+  return sizeof(Torus) * polynomial_size + // accumulator
+         sizeof(double) * 2 * double_count * polynomial_size /
+             2; // accumulator fft
 }

 template <typename Torus>
 uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
-  return safe_mul_sizeof<double>(double_count,
-                                 (size_t)polynomial_size); // accumulator fft
+  return sizeof(double) * 2 * double_count * polynomial_size /
+         2; // accumulator fft
 }

 template <typename Torus>
 uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
-  return safe_mul_sizeof<Torus>(polynomial_size) +      // accumulator_rotated
-         safe_mul_sizeof<Torus>(polynomial_size) +      // accumulator
-         safe_mul_sizeof<double2>(polynomial_size / 2); // accumulator fft
+  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
+         sizeof(Torus) * polynomial_size +      // accumulator
+         sizeof(double2) * polynomial_size / 2; // accumulator fft
 }

 template <typename Torus>
 uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
    uint32_t polynomial_size) {
-  return safe_mul_sizeof<double2>(polynomial_size /
-                                  2); // accumulator fft mask & body
+  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }

 template <typename Torus>
 uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
    uint32_t polynomial_size) {
-  return safe_mul_sizeof<double2>(polynomial_size / 2); // tbc
+  return sizeof(double2) * polynomial_size / 2; // tbc
 }

 template <typename Torus>
@@ -58,40 +56,34 @@ uint64_t get_buffer_size_full_sm_programmable_bootstrap_tbc_2_2_params(
    uint32_t polynomial_size) {
  // In the first implementation with 2-2 params, we need up to 5 polynomials in
  // shared memory we can optimize this later
-  return safe_mul_sizeof<Torus>((size_t)polynomial_size, (size_t)5);
+  return sizeof(Torus) * polynomial_size * 5;
 }

 template <typename Torus>
 uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
-  return safe_mul_sizeof<Torus>(polynomial_size) + // accumulator_rotated
-         safe_mul_sizeof<Torus>(polynomial_size) + // accumulator
-         safe_mul_sizeof<double>((size_t)polynomial_size,
-                                 double_count); // accumulator fft
+  return sizeof(Torus) * polynomial_size + // accumulator_rotated
+         sizeof(Torus) * polynomial_size + // accumulator
+         sizeof(double) * polynomial_size / 2 * 2 *
+             double_count; // accumulator fft
 }

 template <typename Torus>
 uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
-  return safe_mul_sizeof<double>((size_t)polynomial_size,
-                                 double_count); // accumulator fft mask & body
+  return sizeof(double) * polynomial_size / 2 * 2 *
+         double_count; // accumulator fft mask & body
 }

 template <typename Torus>
 bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory);

-struct pbs_buffer_base {
-  virtual void release(cudaStream_t stream, uint32_t gpu_index) = 0;
-  virtual ~pbs_buffer_base() = default;
-};
-
 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

-template <typename Torus>
-struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
+template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  Torus *global_accumulator;
@@ -130,34 +122,27 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {

      uint64_t device_mem = 0;
      if (max_shared_memory < partial_sm) {
-        device_mem =
-            safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm_step_two) {
-        device_mem = safe_mul(
-            partial_dm_step_two +
-                safe_mul(partial_dm_step_one, (size_t)level_count),
-            (size_t)input_lwe_ciphertext_count, (size_t)(glwe_dimension + 1));
+        device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
+                     input_lwe_ciphertext_count * (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm_step_one) {
-        device_mem =
-            safe_mul(partial_dm_step_one, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
+                     level_count * (glwe_dimension + 1);
      }
      // Otherwise, both kernels run all in shared memory
      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);

      global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<double2>(
-              safe_mul((size_t)(glwe_dimension + 1), (size_t)level_count),
-              (size_t)input_lwe_ciphertext_count,
-              (size_t)(polynomial_size / 2)),
+          (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+              (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index, size_tracker, allocate_gpu_memory);

      global_accumulator = (Torus *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<Torus>((size_t)(glwe_dimension + 1),
-                                 (size_t)input_lwe_ciphertext_count,
-                                 (size_t)polynomial_size),
+          (glwe_dimension + 1) * input_lwe_ciphertext_count * polynomial_size *
+              sizeof(Torus),
          stream, gpu_index, size_tracker, allocate_gpu_memory);
    } break;
    case PBS_VARIANT::CG: {
@@ -173,13 +158,11 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
      uint64_t device_mem = 0;

      if (max_shared_memory < partial_sm) {
-        device_mem =
-            safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm) {
-        device_mem =
-            safe_mul(partial_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      }

      // Otherwise, both kernels run all in shared memory
@@ -187,10 +170,8 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);

      global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<double2>(
-              safe_mul((size_t)(glwe_dimension + 1), (size_t)level_count),
-              (size_t)input_lwe_ciphertext_count,
-              (size_t)(polynomial_size / 2)),
+          (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+              polynomial_size / 2 * sizeof(double2),
          stream, gpu_index, size_tracker, allocate_gpu_memory);
    } break;
 #if CUDA_ARCH >= 900
@@ -225,13 +206,11 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
      //
      // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
      if (max_shared_memory < partial_sm + minimum_sm_tbc) {
-        device_mem =
-            safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
-        device_mem =
-            safe_mul(partial_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      }

      // Otherwise, both kernels run all in shared memory
@@ -239,10 +218,8 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);

      global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<double2>(
-              safe_mul((size_t)(glwe_dimension + 1), (size_t)level_count),
-              (size_t)input_lwe_ciphertext_count,
-              (size_t)(polynomial_size / 2)),
+          (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+              polynomial_size / 2 * sizeof(double2),
          stream, gpu_index, size_tracker, allocate_gpu_memory);
    } break;
 #endif
@@ -251,7 +228,7 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
    }
  }

-  void release(cudaStream_t stream, uint32_t gpu_index) override {
+  void release(cudaStream_t stream, uint32_t gpu_index) {
    cuda_drop_with_size_tracking_async(d_mem, stream, gpu_index,
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
@@ -260,15 +237,13 @@ struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> : public pbs_buffer_base {
    if (pbs_variant == DEFAULT)
      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                         gpu_memory_allocated);
-    cuda_synchronize_stream(stream, gpu_index);
  }
 };

 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;

 template <typename InputTorus>
-struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
-    : public pbs_buffer_base {
+struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
@@ -291,10 +266,9 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
    this->pbs_variant = pbs_variant;

    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-    size_t global_join_buffer_size = safe_mul_sizeof<double>(
-        safe_mul((size_t)(glwe_dimension + 1), (size_t)level_count),
-        (size_t)input_lwe_ciphertext_count,
-        safe_mul((size_t)(polynomial_size / 2), (size_t)4));
+    size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
+                                     input_lwe_ciphertext_count *
+                                     polynomial_size / 2 * sizeof(double) * 4;

    switch (pbs_variant) {
    case PBS_VARIANT::DEFAULT: {
@@ -314,18 +288,14 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>

      uint64_t device_mem = 0;
      if (max_shared_memory < partial_sm) {
-        device_mem =
-            safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm_step_two) {
-        device_mem = safe_mul(
-            partial_dm_step_two +
-                safe_mul(partial_dm_step_one, (size_t)level_count),
-            (size_t)input_lwe_ciphertext_count, (size_t)(glwe_dimension + 1));
+        device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
+                     input_lwe_ciphertext_count * (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm_step_one) {
-        device_mem =
-            safe_mul(partial_dm_step_one, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
+                     level_count * (glwe_dimension + 1);
      }
      // Otherwise, both kernels run all in shared memory
      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
@@ -336,9 +306,8 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
          allocate_gpu_memory);

      global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<__uint128_t>((size_t)(glwe_dimension + 1),
-                                       (size_t)input_lwe_ciphertext_count,
-                                       (size_t)polynomial_size),
+          (glwe_dimension + 1) * input_lwe_ciphertext_count * polynomial_size *
+              sizeof(__uint128_t),
          stream, gpu_index, size_tracker, allocate_gpu_memory);
    } break;
    case PBS_VARIANT::CG: {
@@ -354,13 +323,11 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
      uint64_t device_mem = 0;

      if (max_shared_memory < partial_sm) {
-        device_mem =
-            safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm) {
-        device_mem =
-            safe_mul(partial_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      }

      // Otherwise, both kernels run all in shared memory
@@ -403,13 +370,11 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
      //
      // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
      if (max_shared_memory < partial_sm + minimum_sm_tbc) {
-        device_mem =
-            safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
-        device_mem =
-            safe_mul(partial_dm, (size_t)input_lwe_ciphertext_count,
-                     (size_t)level_count, (size_t)(glwe_dimension + 1));
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
      }

      // Otherwise, both kernels run all in shared memory
@@ -426,7 +391,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
    }
  }

-  void release(cudaStream_t stream, uint32_t gpu_index) override {
+  void release(cudaStream_t stream, uint32_t gpu_index) {
    cuda_drop_with_size_tracking_async(d_mem, stream, gpu_index,
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
@@ -435,7 +400,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>
    if (pbs_variant == DEFAULT)
      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                         gpu_memory_allocated);
-    cuda_synchronize_stream(stream, gpu_index);
  }
 };

@@ -452,17 +416,15 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
  uint64_t full_dm = full_sm;
  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
-    device_mem = safe_mul(full_dm, (size_t)input_lwe_ciphertext_count,
-                          (size_t)level_count, (size_t)(glwe_dimension + 1));
+    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                 (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm) {
-    device_mem = safe_mul(partial_dm, (size_t)input_lwe_ciphertext_count,
-                          (size_t)level_count, (size_t)(glwe_dimension + 1));
+    device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                 (glwe_dimension + 1);
  }
-  uint64_t buffer_size =
-      device_mem +
-      safe_mul_sizeof<double2>(
-          safe_mul((size_t)(glwe_dimension + 1), (size_t)level_count),
-          (size_t)input_lwe_ciphertext_count, (size_t)(polynomial_size / 2));
+  uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
+                                          input_lwe_ciphertext_count *
+                                          polynomial_size / 2 * sizeof(double2);
  return buffer_size + buffer_size % sizeof(double2);
 }

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -5,51 +5,77 @@
 #include <stdint.h>

 extern "C" {
-void cuda_fourier_polynomial_mul_async(void *stream, uint32_t gpu_index,
-                                       void const *input1, void const *input2,
-                                       void *output, uint32_t polynomial_size,
-                                       uint32_t total_polynomials);
+void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
+                                 void const *input1, void const *input2,
+                                 void *output, uint32_t polynomial_size,
+                                 uint32_t total_polynomials);

-void cuda_convert_lwe_programmable_bootstrap_key_32_async(
+void cuda_convert_lwe_programmable_bootstrap_key_32(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

-void cuda_convert_lwe_programmable_bootstrap_key_64_async(
+void cuda_convert_lwe_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

-void cuda_convert_lwe_programmable_bootstrap_key_128_async(
+void cuda_convert_lwe_programmable_bootstrap_key_128(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

-uint64_t scratch_cuda_programmable_bootstrap_64_async(
+uint64_t scratch_cuda_programmable_bootstrap_amortized_32(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+uint64_t scratch_cuda_programmable_bootstrap_amortized_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
+                                                   uint32_t gpu_index,
+                                                   int8_t **pbs_buffer);
+
+uint64_t scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-uint64_t scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
+uint64_t scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-uint64_t scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
+uint64_t scratch_cuda_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-uint64_t scratch_cuda_programmable_bootstrap_128_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
-
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_async(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
@@ -58,7 +84,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_async(
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);

-void cuda_programmable_bootstrap_64_async(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
@@ -67,33 +93,15 @@ void cuda_programmable_bootstrap_64_async(
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);

-void cuda_programmable_bootstrap_tbc_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
-
-void cuda_programmable_bootstrap_tbc_64_2_2_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
-
-void cuda_programmable_bootstrap_128_async(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);

-void cleanup_cuda_programmable_bootstrap_64(void *stream, uint32_t gpu_index,
-                                            int8_t **pbs_buffer);
+void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
+                                         int8_t **pbs_buffer);

 void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
                                             int8_t **pbs_buffer);
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -10,32 +10,22 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t num_samples, uint32_t max_shared_memory);

-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64_async(
+void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128_async(
+void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64_async(
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cuda_multi_bit_programmable_bootstrap_64_async(
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
@@ -45,36 +35,16 @@ void cuda_multi_bit_programmable_bootstrap_64_async(
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

-void cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
+void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
+                                                   uint32_t gpu_index,
+                                                   int8_t **pbs_buffer);

-void cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
-                                                      uint32_t gpu_index,
-                                                      int8_t **pbs_buffer);
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

-void cuda_multi_bit_programmable_bootstrap_128_async(
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lwe_array_in, void const *lwe_input_indexes,
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
@@ -4,7 +4,7 @@
 #include "../integer/integer.h"

 extern "C" {
-uint64_t scratch_cuda_trivium_generate_keystream_64_async(
+uint64_t scratch_cuda_trivium_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -12,14 +12,13 @@ uint64_t scratch_cuda_trivium_generate_keystream_64_async(
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);

-void cuda_trivium_generate_keystream_64_async(
+void cuda_trivium_generate_keystream_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks);

-void cleanup_cuda_trivium_generate_keystream_64(CudaStreamsFFI streams,
-                                                int8_t **mem_ptr_void);
+void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
@@ -33,7 +33,7 @@ template <typename Torus> struct int_trivium_lut_buffers {
    auto active_streams_and =
        streams.active_gpu_subset(total_lut_ops, params.pbs_type);
    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -48,7 +48,7 @@ template <typename Torus> struct int_trivium_lut_buffers {
    auto active_streams_flush =
        streams.active_gpu_subset(total_flush_ops, params.pbs_type);
    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
+        active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

@@ -254,8 +254,6 @@ template <typename Torus> struct int_trivium_state_workspaces {
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   this->packed_flush_out, allocate_gpu_memory);
    delete this->packed_flush_out;
-
-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -3,11 +3,10 @@

 #include "../keyswitch/ks_enums.h"
 #include "../pbs/pbs_enums.h"
-#include "zk_enums.h"
 #include <stdint.h>

 extern "C" {
-uint64_t scratch_cuda_expand_without_verification_64_async(
+uint64_t scratch_cuda_expand_without_verification_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t computing_ks_level,
@@ -18,15 +17,14 @@ uint64_t scratch_cuda_expand_without_verification_64_async(
    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
    uint32_t num_compact_lists, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
-    bool allocate_gpu_memory, EXPAND_KIND expand_kind,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_expand_without_verification_64_async(
+void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
    const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *computing_ksks, void *const *casting_keys);

-void cleanup_cuda_expand_without_verification_64(CudaStreamsFFI streams,
-                                                 int8_t **mem_ptr_void);
+void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
+                                            int8_t **mem_ptr_void);
 }
 #endif // ZK_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_enums.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_enums.h
@@ -1,7 +0,0 @@
-#ifndef CUDA_ZK_ENUMS_H
-#define CUDA_ZK_ENUMS_H
-#include <stdint.h>
-// Additional to the two kinds of expand (no_casting and casting), we have a
-// third that is used only in the noise tests
-enum EXPAND_KIND { NO_CASTING = 0, CASTING = 1, SANITY_CHECK = 2 };
-#endif // CUDA_ZK_ENUMS_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -1,7 +1,7 @@
 #ifndef ZK_UTILITIES_H
 #define ZK_UTILITIES_H
+
 #include "../integer/integer_utilities.h"
-#include "checked_arithmetic.h"
 #include "integer/integer.cuh"
 #include <cstdint>

@@ -59,8 +59,8 @@ template <typename Torus> struct flattened_compact_lwe_lists {
                              uint32_t lwe_dimension)
      : d_ptr(d_ptr), h_num_lwes_per_compact_list(h_num_lwes_per_compact_list),
        num_compact_lists(num_compact_lists), lwe_dimension(lwe_dimension) {
-    ptr_array_to_d_compact_list = static_cast<Torus **>(
-        malloc(safe_mul_sizeof<Torus *>(num_compact_lists)));
+    ptr_array_to_d_compact_list =
+        static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus *)));
    total_num_lwes = 0;
    auto curr_list = d_ptr;
    for (auto i = 0; i < num_compact_lists; ++i) {
@@ -80,7 +80,6 @@ template <typename Torus> struct flattened_compact_lwe_lists {
                            h_num_lwes_per_compact_list[compact_list_index]);
  }

-  // nosemgrep: release-missing-cuda-synchronize
  void release() { free(ptr_array_to_d_compact_list); }
 };

@@ -106,7 +105,6 @@ template <typename Torus> struct zk_expand_mem {
  uint32_t num_compact_lists;

  int_radix_lut<Torus> *message_and_carry_extract_luts;
-  int_radix_lut<Torus> *identity_lut;

  Torus *tmp_expanded_lwes;
  Torus *tmp_ksed_small_to_big_expanded_lwes;
@@ -117,49 +115,33 @@ template <typename Torus> struct zk_expand_mem {
  expand_job<Torus> *d_expand_jobs;
  expand_job<Torus> *h_expand_jobs;

-  EXPAND_KIND expand_kind;
-
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
                const bool *is_boolean_array,
                const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
-                bool allocate_gpu_memory, uint64_t &size_tracker,
-                EXPAND_KIND expand_kind_in)
+                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
-        casting_key_type(casting_key_type), expand_kind(expand_kind_in) {
+        casting_key_type(casting_key_type) {
    gpu_memory_allocated = allocate_gpu_memory;

    // We copy num_lwes_per_compact_list so we get protection against
    // num_lwes_per_compact_list being freed while this buffer is still in use
    this->num_lwes_per_compact_list =
-        (uint32_t *)malloc(safe_mul_sizeof<uint32_t>(num_compact_lists));
+        (uint32_t *)malloc(num_compact_lists * sizeof(uint32_t));
    memcpy(this->num_lwes_per_compact_list, num_lwes_per_compact_list,
-           safe_mul_sizeof<uint32_t>(num_compact_lists));
+           num_compact_lists * sizeof(uint32_t));

    num_lwes = 0;
    for (int i = 0; i < num_compact_lists; i++) {
      num_lwes += this->num_lwes_per_compact_list[i];
    }

-    if (computing_params.carry_modulus != computing_params.message_modulus &&
-        expand_kind == EXPAND_KIND::CASTING) {
+    if (computing_params.carry_modulus != computing_params.message_modulus) {
      PANIC("GPU backend requires carry_modulus equal to message_modulus")
    }

-    // We create the identity LUT only if we are doing a SANITY_CHECK
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut =
-          new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
-                                   allocate_gpu_memory, size_tracker);
-
-      auto identity_lut_f = [](Torus x) -> Torus { return x; };
-
-      identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
-                                               LUT_0_FOR_ALL_BLOCKS);
-    }
-
    auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
      return x % casting_params.message_modulus;
    };
@@ -202,19 +184,19 @@ template <typename Torus> struct zk_expand_mem {

    // Adjust indexes to permute the output and access the correct LUT
    auto h_indexes_in = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+        malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
    auto h_indexes_out = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+        malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
    auto h_lut_indexes = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+        malloc(num_packed_msgs * num_lwes * sizeof(Torus)));

    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
+            num_lwes * sizeof(expand_job<Torus>), streams.stream(0),
            streams.gpu_index(0), size_tracker, allocate_gpu_memory));

    h_expand_jobs = static_cast<expand_job<Torus> *>(
-        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
+        malloc(num_lwes * sizeof(expand_job<Torus>)));

    /*
     * Each LWE contains encrypted data in both carry and message spaces
@@ -279,54 +261,45 @@ template <typename Torus> struct zk_expand_mem {
                       "Cuda error: index %d for is_boolean_array is out of "
                       "bounds (len is %d)",
                       h_indexes_out[lwe_index], is_boolean_array_len);
+        auto boolean_offset =
+            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
+        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+        PANIC_IF_FALSE(
+            h_lut_indexes[lwe_index] < 4,
+            "Cuda error: lut index is greater than the max possible value (3)");
      }
      offset += num_lwes_in_kth;
    }

    message_and_carry_extract_luts->set_lwe_indexes(
        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
+    auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
+
+    cuda_memcpy_with_size_tracking_async_to_gpu(
+        lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
+        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);

    auto active_streams =
        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);

-    // Index generator for message/carry extraction LUTs
-    auto index_gen = [num_compact_lists,
-                      num_lwes_per_compact_list =
-                          this->num_lwes_per_compact_list,
-                      num_packed_msgs, is_boolean_array,
-                      h_indexes_out](Torus *h_lut_indexes, uint32_t) {
-      auto offset = 0;
-      for (int k = 0; k < num_compact_lists; k++) {
-        auto num_lwes_in_kth = num_lwes_per_compact_list[k];
-        for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
-          auto lwe_index = i + num_packed_msgs * offset;
-          auto boolean_offset =
-              is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
-          h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
-        }
-        offset += num_lwes_in_kth;
-      }
-    };
-
    message_and_carry_extract_luts->generate_and_broadcast_lut(
        active_streams, {0, 1, 2, 3},
        {message_extract_lut_f, carry_extract_lut_f,
         message_extract_and_sanitize_bool_lut_f,
         carry_extract_and_sanitize_bool_lut_f},
-        index_gen, true, {}, h_lut_indexes);
+        gpu_memory_allocated);

    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
    // The expanded LWEs will always be on the casting key format
    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
+        num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    tmp_ksed_small_to_big_expanded_lwes =
        (Torus *)cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<Torus>(num_lwes,
-                                   casting_params.big_lwe_dimension + 1),
+            num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory);

@@ -340,11 +313,6 @@ template <typename Torus> struct zk_expand_mem {
    message_and_carry_extract_luts->release(streams);
    delete message_and_carry_extract_luts;

-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut->release(streams);
-      delete identity_lut;
-    }
-
    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
--- a/Show More
+++ b/Show More