chore(shortint): add ks32 & prod params to tests

Make KS32 params the default, add them to tests, and add a PARAM_PROD param alias which is still the KS-PBS so that it's still tested. Prod params are tested in a separate workflow to avoid growing the time tests take. BREAKING CHANGE: `PARAM_MESSAGE_2_CARRY_2` now returns `KeySwitch32PBSParameters` instead of `ClassicPBSParameters`.
chore: remove integer test filter which is not relevant anymore
2026-04-28 03:01:21 -04:00 · 2026-02-09 11:43:42 +01:00 · 2026-02-09 11:42:58 +01:00 · 2026-02-09 10:20:56 +01:00 · 2026-02-09 09:35:29 +01:00 · 2026-02-08 12:14:07 -03:00
423 changed files with 21637 additions and 6139 deletions
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -23,6 +23,10 @@ runs:
        echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
        sha256sum -c checksum
        sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
+
+        # Disable unattended-upgrades to avoid apt/dpkg lock contention with the apt commands below
+        sudo systemctl disable --now unattended-upgrades
+
        sudo apt-get clean
        sudo rm -rf /var/lib/apt/lists/*
        sudo apt update
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -0,0 +1,15 @@
+runners:  # Named runner profiles; each one selects an instance family, an image, a volume size and a spot setting
+  cpu-big:
+    family: m6i.32xlarge  # presumably an AWS EC2 instance type — confirm against the RunsOn documentation
+    image: cpu-tests-eu-west-3  # refers to the entry of the same name under `images` below
+    volume: 200gb
+    spot: false  # presumably requests on-demand rather than spot capacity — TODO confirm
+  cpu-small:
+    family: m6i.4xlarge  # same settings as cpu-big except for the instance family
+    image: cpu-tests-eu-west-3  # same image entry as cpu-big
+    volume: 200gb
+    spot: false
+
+images:  # Image definitions referenced by the `image` key of the runner profiles above
+  cpu-tests-eu-west-3:
+    ami: "ami-0a786ffdb1411fac4"  # Ubuntu 24.04
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -34,6 +34,9 @@ permissions:
 jobs:
  setup-instance:
    name: aws_tfhe_backward_compat_tests/setup-instance
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name != 'push'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -41,7 +44,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -66,7 +69,7 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'true' # Needed to pull lfs data
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +83,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -109,7 +112,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -141,7 +144,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -63,7 +63,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -219,7 +219,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            ~/.nvm
@@ -232,7 +232,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -299,7 +299,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -50,7 +50,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -86,7 +86,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
    timeout-minutes: 1440
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_param_prod_tests.yml
+++ b/.github/workflows/aws_tfhe_param_prod_tests.yml
@@ -0,0 +1,178 @@
+# Run the shortint test suite with the production parameter set (PARAM_PROD).
+name: aws_tfhe_param_prod_tests
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  PULL_REQUEST_MD_LINK: ""
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]
+
+permissions:
+  contents: read
+
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
+
+jobs:
+  should-run:  # Gate job: decides from the changed files whether the param-prod tests need to run
+    name: aws_tfhe_param_prod_tests/should-run
+    if: (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read # Needed to check for file changes
+    outputs:
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }} # NOTE(review): no `csprng` filter is defined in files_yaml below, so this is empty on pull requests
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }} # NOTE(review): no `zk_pok` filter is defined in files_yaml below, so this is empty on pull requests
+      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
+        with:
+          files_yaml: |
+            dependencies:
+              - tfhe/Cargo.toml
+              - tfhe-csprng/**
+              - tfhe-fft/**
+              - tfhe-zk-pok/**
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            versionable:
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+      - name: Aggregate file changes # NOTE(review): the csprng/zk_pok/core_crypto terms below have no matching filter above and never evaluate to 'true'
+        id: aggregated-changes
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' )
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
+  setup-instance:  # Provisions the runner: a Slab-managed AWS instance when secrets are available, a GitHub-hosted runner otherwise
+    name: aws_tfhe_param_prod_tests/setup-instance
+    if: github.event_name == 'workflow_dispatch' ||
+      needs.should-run.outputs.any_file_changed == 'true'
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # This runner is used for pull requests coming from forked repositories, where secrets are not available
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  param-prod-tests:  # Runs the shortint tests with the production parameters on the runner selected by setup-instance
+    name: aws_tfhe_param_prod_tests/param-prod-tests
+    needs: [ should-run, setup-instance ]
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
+      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true'
+        run: |
+          make gen_key_cache
+
+      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
+        run: |
+          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=FALSE make test_param_prod_shortint_ci
+      - name: Set pull-request URL # Only on failure: exports the PR link that the Slack message below embeds
+        if: ${{ failure() && github.event_name == 'pull_request' }}
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Slack Notification
+        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Param prod AWS tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:  # Stops the remote instance started by setup-instance
+    name: aws_tfhe_param_prod_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }} # Runs even when the tests failed, as long as setup-instance succeeded
+    needs: [ setup-instance, param-prod-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (param-prod-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -51,7 +51,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -72,7 +72,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -155,7 +155,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -39,7 +39,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +80,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            ~/.nvm
@@ -93,7 +93,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -147,7 +147,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -16,7 +16,8 @@ on:
          - integer_zk
          - shortint
          - shortint_oprf
-          - hlapi
+          - hlapi_unsigned
+          - hlapi_signed
          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -24,6 +24,7 @@ permissions: {}
 jobs:
  prepare-inputs:
    name: benchmark_cpu_weekly/prepare-inputs
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    outputs:
      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -72,8 +73,7 @@ jobs:

  run-benchmarks-integer:
    name: benchmark_cpu_weekly/run-benchmarks-integer
-    if: github.repository == 'zama-ai/tfhe-rs' 
-      && (needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -92,8 +92,7 @@ jobs:

  run-benchmarks-integer-zk-pke:
    name: benchmark_cpu_weekly/run-benchmarks-integer-zk-pke
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -111,8 +110,7 @@ jobs:

  run-benchmarks-hlapi-erc20:
    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -130,8 +128,7 @@ jobs:

  run-benchmarks-hlapi-dex:
    name: benchmark_cpu_weekly/run-benchmarks-hlapi-dex
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -149,8 +146,7 @@ jobs:

  run-benchmarks-core-crypto:
    name: benchmark_cpu_weekly/run-benchmarks-core-crypto
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -167,8 +163,7 @@ jobs:

  run-benchmarks-shortint:
    name: benchmark_cpu_weekly/run-benchmarks-shortint
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && (needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -186,8 +181,7 @@ jobs:

  run-benchmarks-boolean:
    name: benchmark_cpu_weekly/run-benchmarks-boolean
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -206,8 +200,7 @@ jobs:

  run-benchmarks-tfhe-zk-pok:
    name: benchmark_cpu_weekly/run-benchmarks-tfhe-zk-pok
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -25,10 +25,6 @@ on:
        description: "Generate SVG tables"
        type: boolean
        default: true
-      open-pr:
-        description: "Open a PR with the benchmark results"
-        type: boolean
-        default: false

 permissions: {}

@@ -166,54 +162,3 @@ jobs:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  open-pr:
-    name: benchmark-documentation/open-pr
-    needs: [ generate-svgs-with-benchmarks-run, generate-svgs-without-benchmarks-run ]
-    if: ${{ always() && inputs.open-pr &&
-      (needs.generate-svgs-with-benchmarks-run.result == 'success' || needs.generate-svgs-without-benchmarks-run.result == 'success') }}
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write # Needed to create a commit
-      pull-requests: write # Needed to open a pull-request
-    env:
-      PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
-        with:
-          persist-credentials: 'false'
-
-      - name: Download SVG tables
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
-        with:
-          path: svg_tables
-          merge-multiple: 'true'
-
-      # Perform best effort to copy SVG tables. If the copy fails or files don't exist, the PR will still be created.
-      - name: Copy SVG tables to documentation location
-        run: |
-          cp -f svg_tables/*integer-benchmark*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-          cp -f svg_tables/*pbs-benchmark-tuniform*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-          cp -f svg_tables/cpu-gpu-hpu-integer-benchmark-fheuint64-tuniform-2m128-ciphertext.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-
-      - name: Get current date
-        id: get-date
-        run: |
-          echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"
-
-      - name: Create pull-request
-        uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
-        with:
-          sign-commits: true # Commit will be signed by github-actions bot
-          add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
-          branch: gh-bot/docs/update-svg-tables-${{ steps.get-date.outputs.date }}
-          commit-message: |
-            chore(docs): update benchmark results for all backends
-
-            Automated documentation update from tfhe-rs CI pipeline.
-          title: |
-            [CI] chore(docs): update benchmark results for all backends
-          body: |
-            Documentation update triggered by GitHub workflow.
-          labels: documentation
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 1440 # 24 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -123,7 +123,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -50,6 +50,8 @@ env:
 jobs:
  parse-inputs:
    name: benchmark_gpu_coprocessor/parse-inputs
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
@@ -92,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +132,7 @@ jobs:
          git lfs install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          path: tfhe-rs
          persist-credentials: false
@@ -141,7 +143,7 @@ jobs:
          ls

      - name: Checkout fhevm
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          repository: zama-ai/fhevm
          persist-credentials: 'false'
@@ -192,10 +194,10 @@ jobs:
          cargo install sqlx-cli

      - name: Install foundry
-        uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
+        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
+        uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
        with:
          path: |
            ~/.cargo/registry
@@ -223,7 +225,7 @@ jobs:
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Use Node.js
-        uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
+        uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # v6.2.0
        with:
          node-version: 20.x

@@ -299,7 +301,7 @@ jobs:
          path: fhevm/$${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +326,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -25,6 +25,7 @@ permissions: {}
 jobs:
  prepare-inputs:
    name: benchmark_cpu_weekly/prepare-inputs
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    outputs:
      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -49,8 +50,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-integer:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -72,8 +72,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-integer-compression:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -95,8 +94,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-integer-zk-aes:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk-aes
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -118,8 +116,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-noise-squash:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -141,8 +138,7 @@ jobs:

  run-benchmarks-1-h100-core-crypto:
    name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -166,8 +162,7 @@ jobs:

  run-benchmarks-1-h100-erc20:
    name: benchmark_gpu_weekly/run-benchmarks-1-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -187,8 +182,7 @@ jobs:

  run-benchmarks-2-h100-erc20:
    name: benchmark_gpu_weekly/run-benchmarks-2-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -208,8 +202,7 @@ jobs:

  run-benchmarks-8-h100-erc20:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -233,8 +226,7 @@ jobs:

  run-benchmarks-1-h100-dex:
    name: benchmark_gpu_weekly/run-benchmarks-1-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -254,8 +246,7 @@ jobs:

  run-benchmarks-2-h100-dex:
    name: benchmark_gpu_weekly/run-benchmarks-2-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -275,8 +266,7 @@ jobs:

  run-benchmarks-8-h100-dex:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -12,7 +12,8 @@ on:
        default: integer
        options:
          - integer
-          - hlapi
+          - hlapi_unsigned
+          - hlapi_signed
          - hlapi_erc20
      op_flavor:
        description: "Operations set to run"
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -126,7 +126,7 @@ jobs:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -50,7 +50,7 @@ jobs:
      pull-requests: write # Needed to write a comment in a pull-request
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
        command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0  # Needed to get commit hash
          persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -305,13 +305,13 @@ jobs:
      REF_NAME: ${{ github.head_ref || github.ref_name }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Install recent Python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.12'
          pip-install: -r ci/data_extractor/requirements.txt -r ci/perf_regression/requirements.txt
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -31,13 +31,16 @@ permissions: {}
 jobs:
  setup-instance:
    name: benchmark_tfhe_fft/setup-instance
+    if:
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -73,7 +76,7 @@ jobs:
          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly
          override: true
@@ -102,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -31,13 +31,16 @@ permissions: {}
 jobs:
  setup-instance:
    name: benchmark_tfhe_ntt/setup-instance
+    if:
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -73,7 +76,7 @@ jobs:
          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly
          override: true
@@ -102,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -31,15 +31,14 @@ jobs:
    name: benchmark_wasm_client/should-run
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
      pull-requests: read  # Needed to check for file change
    outputs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -71,7 +70,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +90,7 @@ jobs:
        browser: [ chrome, firefox ]
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -119,7 +118,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        with:
          path: |
            ~/.nvm
@@ -132,7 +131,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -181,7 +180,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -213,7 +212,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_audit.yml
+++ b/.github/workflows/cargo_audit.yml
@@ -24,9 +24,11 @@ permissions: {}
 jobs:
  audit:
    name: cargo_audit/audit
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -24,7 +24,7 @@ jobs:
    outputs:
      matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
      result: ${{ steps.set_builds_result.outputs.result }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -26,13 +26,13 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable
          override: true
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -24,13 +24,13 @@ jobs:
        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable
          override: true
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -2,6 +2,7 @@
 name: cargo_test_fft

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -22,6 +23,8 @@ permissions:
 jobs:
  should-run:
    name: cargo_test_fft/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -29,7 +32,7 @@ jobs:
      fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -56,13 +59,13 @@ jobs:
        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable
          override: true
@@ -92,7 +95,7 @@ jobs:
    if: needs.should-run.outputs.fft_test == 'true'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -2,6 +2,7 @@
 name: cargo_test_ntt

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -24,6 +25,8 @@ permissions:
 jobs:
  should-run:
    name: cargo_test_ntt/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -31,7 +34,7 @@ jobs:
      ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -60,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,13 +90,13 @@ jobs:
        os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable
          override: true
@@ -143,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -43,7 +43,7 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
+        uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -31,7 +31,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -37,7 +37,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -43,7 +43,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'

--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -19,8 +19,8 @@ on:
  pull_request:
    types: [ labeled ]
  schedule:
-    # Nightly tests @ 1AM after each work day
-    - cron: "0 1 * * MON-FRI"
+   # At 1AM on odd days of the month (roughly every other day; runs on both the 31st and the 1st)
+   - cron: "0 1 */2 * *"

 permissions:
  contents: read
@@ -37,11 +37,11 @@ jobs:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ["self-hosted", "4090-desktop"]
-    timeout-minutes: 1440 # 24 hours
+    timeout-minutes: 2880 # 48 hours

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -23,8 +23,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # every month
-    - cron: "0 0 1 * *"
+    # every friday noon
+    - cron: "0 12 * * 5"

 permissions:
  contents: read
@@ -35,15 +35,15 @@ jobs:
  setup-instance:
    name: gpu_code_validation_tests/setup-instance
    runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved')
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: gpu_fast_h100_tests
+name: gpu_core_h100_tests

 env:
  CARGO_TERM_COLOR: always
@@ -32,7 +32,7 @@ permissions:

 jobs:
  should-run:
-    name: gpu_fast_h100_tests/should-run
+    name: gpu_core_h100_tests/should-run
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -61,15 +61,14 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
-              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
-              - '.github/workflows/gpu_fast_h100_tests.yml'
+              - '.github/workflows/gpu_core_h100_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml

  setup-instance:
-    name: gpu_fast_h100_tests/setup-instance
+    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -87,7 +86,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,7 +110,7 @@ jobs:
          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
-    name: gpu_fast_h100_tests/cuda-tests-linux
+    name: gpu_core_h100_tests/cuda-tests-linux
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -129,7 +128,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -155,20 +154,8 @@ jobs:
          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

-      - name: Run user docs tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
-
  slack-notify:
-    name: gpu_fast_h100_tests/slack-notify
+    name: gpu_core_h100_tests/slack-notify
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -187,10 +174,10 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+          SLACK_MESSAGE: "Core H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

  teardown-instance:
-    name: gpu_fast_h100_tests/teardown-instance
+    name: gpu_core_h100_tests/teardown-instance
    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
@@ -198,7 +185,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -39,7 +39,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -79,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -151,7 +151,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          make test_high_level_api_gpu_fast

  slack-notify:
    name: gpu_fast_tests/slack-notify
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,7 +68,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,7 +154,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          make test_high_level_api_gpu_fast

  slack-notify:
    name: gpu_full_multi_gpu_tests/slack-notify
@@ -187,7 +187,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -0,0 +1,209 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: gpu_hlapi_h100_tests
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  PULL_REQUEST_MD_LINK: ""
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]
+
+permissions:
+  contents: read
+
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
+
+jobs:
+  should-run:
+    name: gpu_hlapi_h100_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
+              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml
+
+  setup-instance:
+    name: gpu_hlapi_h100_tests/setup-instance
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      # Use the permanent remote instance label first, because the on-demand remote instance label output is set before the end of the start-remote-instance step.
+      # If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
+      # Since the on-demand remote label is set before the failure, we have to do the logical OR in this order,
+      # otherwise we would try to run the next job on a nonexistent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+      # This allows falling back on permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
+      # This instance is spawned specifically for pull requests from forked repositories.
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cuda-tests-linux:
+    name: gpu_hlapi_h100_tests/cuda-tests-linux
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.8"
+            gcc: 11 
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
+        uses: ./.github/actions/gpu_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+      - name: Enable nvidia multi-process service
+        run: |
+          nvidia-cuda-mps-control -d
+      
+      - name: Run user docs tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
+
+  slack-notify:
+    name: gpu_hlapi_h100_tests/slack-notify
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Set pull-request URL
+        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Send message
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "HL API H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: gpu_hlapi_h100_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,7 +65,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -74,7 +74,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -2,6 +2,7 @@
 name: hpu_hlapi_tests

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -25,6 +26,8 @@ permissions: {}
 jobs:
  should-run:
    name: hpu_hlapi_tests/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -32,7 +35,7 @@ jobs:
      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -62,7 +65,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,13 +86,13 @@ jobs:
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable
          override: true
@@ -114,7 +117,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -34,7 +34,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -41,7 +41,7 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -52,7 +52,7 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -75,6 +75,7 @@ jobs:
    name: make_release_common/provenance
    if: ${{ !inputs.dry-run  }}
    needs: package
+    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
    permissions:
      actions: read # Needed to detect the GitHub Actions environment
@@ -93,7 +94,7 @@ jobs:
      id-token: write # Needed for OIDC token exchange on crates.io
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -117,6 +117,7 @@ jobs:
    name: make_release_cuda/provenance
    if: ${{ !inputs.dry_run  }}
    needs: [package]
+    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
    permissions:
      actions: read # Needed to detect the GitHub Actions environment
@@ -221,7 +222,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -68,7 +68,7 @@ jobs:
      id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -85,14 +85,14 @@ jobs:
          make build_web_js_api_parallel

      - name: Authenticate on NPM
-        uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
+        uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # v6.2.0
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@7f8fe47b3bea1be0c3aec2b717c5ec1f3e03410b
+        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@7f8fe47b3bea1be0c3aec2b717c5ec1f3e03410b
+        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -30,7 +30,7 @@ jobs:
    name: parameters_check/setup-instance
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch'
+      github.event_name != 'push'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
          toolchain: stable

      - name: Checkout lattice-estimator
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/placeholder_workflow.yml
+++ b/.github/workflows/placeholder_workflow.yml
@@ -1,18 +1,179 @@
 # Placeholder workflow file allowing running it without having to merge to main first
 name: placeholder_workflow

+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  PULL_REQUEST_MD_LINK: ""
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+
 on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]

-permissions: {}
+permissions:
+  contents: read

-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  placeholder:
-    name: placeholder_workflow/placeholder
+  should-run:
+    name: aws_tfhe_param_prod_tests/should-run
+    if: (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
-
+    permissions:
+      pull-requests: read # Needed to check for file change
+    outputs:
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
-      - run: |
-          echo "Hello this is a Placeholder Workflow"
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
+        with:
+          files_yaml: |
+            dependencies:
+              - tfhe/Cargo.toml
+              - tfhe-csprng/**
+              - tfhe-fft/**
+              - tfhe-zk-pok/**
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            versionable:
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+      - name: Aggregate file changes
+        id: aggregated-changes
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' )
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
+
+  setup-instance:
+    name: aws_tfhe_param_prod_tests/setup-instance
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  param-prod-tests:
+    name: aws_tfhe_param_prod_tests/param-prod-tests
+    needs: [ should-run, setup-instance ]
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
+      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true'
+        run: |
+          make gen_key_cache
+
+      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
+        run: |
+          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=FALSE make test_param_prod_shortint_ci
+      - name: Set pull-request URL
+        if: ${{ failure() && github.event_name == 'pull_request' }}
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Slack Notification
+        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Param prod tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: aws_tfhe_param_prod_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, param-prod-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (param-prod-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/pr_milestone_check.yml
+++ b/.github/workflows/pr_milestone_check.yml
@@ -1,67 +0,0 @@
-name: pr_milestone_check
-
-on:
-  pull_request:
-    types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-# external contributors workflows are manually approved
-
-jobs:
-  check-empty-milestone:
-    name: pr_milestone_check/check-empty-milestone
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone == null
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone Missing
-
-            Please assign a milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is missing. This check is failing."
-          exit 1
-
-  check-milestone-open:
-    name: pr_milestone_check/check-milestone-open
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone is closed
-
-            Please assign an open milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is closed. This check is failing."
-          exit 1
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -30,7 +30,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
+          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -47,6 +47,8 @@ jobs:

          echo ">>> Pushing all LFS items..."
          git lfs push --all destination "${DESTINATION_BRANCH}"
+          
+          shred --remove .git/config

      - name: git-sync-tags
        env:
@@ -59,7 +61,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
+          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -70,3 +72,5 @@ jobs:

          echo ">>> Pushing git changes..."
          git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
+          
+          shred --remove .git/config
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -12,6 +12,7 @@ permissions: {}
 jobs:
  stale:
    name: unverified_prs/stale
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    permissions:
      issues: read # Needed to fetch all issues
--- a/82
+++ b/82
@@ -29,8 +29,9 @@ WASM_PACK_VERSION="0.13.1"
 WASM_BINDGEN_VERSION:=$(shell cargo tree --target wasm32-unknown-unknown -e all --prefix none | grep "wasm-bindgen v" | head -n 1 | cut -d 'v' -f2)
 WEB_RUNNER_DIR=web-test-runner
 WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
-TYPOS_VERSION=1.39.0
-ZIZMOR_VERSION=1.16.2
+TAPLO_VERSION=0.10.0
+TYPOS_VERSION=1.42.0
+ZIZMOR_VERSION=1.20.0
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
 # copy paste the command in the terminal and change them if required without forgetting the flags
 export RUSTFLAGS?=-C target-cpu=native
@@ -171,6 +172,10 @@ install_cargo_dylint:
 install_cargo_audit:
 	cargo install --locked cargo-audit

+.PHONY: install_taplo # Install taplo to format and check TOML files
+install_taplo:
+	@./scripts/install_taplo.sh --taplo-version $(TAPLO_VERSION)
+
 .PHONY: install_typos_checker # Install typos checker
 install_typos_checker:
 	@./scripts/install_typos.sh --typos-version $(TYPOS_VERSION)
@@ -283,6 +288,10 @@ fmt_gpu: install_rs_check_toolchain
 fmt_c_tests:
 	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format -style=file -i {} \;

+.PHONY: fmt_toml # Format TOML files
+fmt_toml: install_taplo
+	taplo fmt
+
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
@@ -307,6 +316,11 @@ check_fmt_js: check_nvm_installed
 	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt && \
 	$(MAKE) -C tfhe/js_on_wasm_tests check_fmt

+.PHONY: check_fmt_toml # Check TOML files format, fails if any file is not formatted
+check_fmt_toml: install_taplo
+	@RUST_LOG=warn taplo fmt --check || \
+	(echo "TOML files format check failed. Please run 'make fmt_toml'" && exit 1)
+
 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
 	@typos && echo "No typos found"
@@ -719,11 +733,12 @@ test_core_crypto_gpu:
 		--features=gpu -p tfhe -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --doc --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4
+test_integer_gpu: install_cargo_nextest
+	TEST_THREADS=2 \
+	DOCTEST_THREADS=4 \
+		./scripts/integer-tests.sh \
+		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
+		--tfhe-package "tfhe" --all-but-noise

 .PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
 test_integer_gpu_debug:
@@ -905,6 +920,14 @@ test_shortint_ci: install_cargo_nextest
 		./scripts/shortint-tests.sh \
 		--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "tfhe"

+.PHONY: test_param_prod_shortint_ci # Run the tests for shortint ci running only prod parameters tests
+test_param_prod_shortint_ci: install_cargo_nextest
+	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
+	FAST_TESTS="$(FAST_TESTS)" \
+		./scripts/shortint-tests.sh \
+		--cargo-profile "$(CARGO_PROFILE)" --run-prod-only --tfhe-package "tfhe"
+
+
 .PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
 test_shortint_multi_bit_ci: install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -1035,10 +1058,16 @@ test_high_level_api:
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::

-test_high_level_api_gpu: install_cargo_nextest
+test_high_level_api_gpu_fast: install_cargo_nextest # Run all the GPU tests for high_level_api except test_uniformity for oprf which is too long
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-		-E "test(/high_level_api::.*gpu.*/)"
+	  -E "test(/high_level_api::.*gpu.*/) and not test(/uniformity/)"
+
+
+test_high_level_api_gpu: install_cargo_nextest # Run all the GPU tests for high_level_api
+	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
+		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
+  	-E "test(/high_level_api::.*gpu.*/)"

 test_list_gpu: install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest list --cargo-profile $(CARGO_PROFILE) \
@@ -1357,6 +1386,9 @@ clippy_bench: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+	  --features=shortint,internal-keycache \
+		-p tfhe-benchmark -- --no-deps -D warnings

 .PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark
 clippy_bench_gpu: install_rs_check_toolchain
@@ -1391,14 +1423,14 @@ bench_signed_integer: install_rs_check_toolchain

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
 bench_signed_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1435,7 +1467,7 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
@@ -1461,6 +1493,13 @@ bench_integer_trivium_gpu: install_rs_check_toolchain
 	--bench integer-trivium \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

+.PHONY: bench_integer_kreyvium_gpu # Run benchmarks for kreyvium on GPU backend
+bench_integer_kreyvium_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-kreyvium \
+	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1495,7 +1534,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1641,11 +1680,18 @@ bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_unsafe_coop_firefox

-.PHONY: bench_hlapi # Run benchmarks for integer operations
-bench_hlapi: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
+.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
+bench_hlapi_unsigned: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi \
+	--bench hlapi_unsigned \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_signed # Run benchmarks for signed integer operations
+bench_hlapi_signed: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi_signed \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
@@ -1846,6 +1892,7 @@ pcc_batch_1:
 	$(call run_recipe_with_details,no_dbg_log)
 	$(call run_recipe_with_details,check_parameter_export_ok)
 	$(call run_recipe_with_details,check_fmt)
+	$(call run_recipe_with_details,check_fmt_toml)
 	$(call run_recipe_with_details,check_typos)
 	$(call run_recipe_with_details,lint_doc)
 	$(call run_recipe_with_details,check_md_docs_are_tested)
@@ -1921,6 +1968,7 @@ fpcc:
 	$(call run_recipe_with_details,no_dbg_log)
 	$(call run_recipe_with_details,check_parameter_export_ok)
 	$(call run_recipe_with_details,check_fmt)
+	$(call run_recipe_with_details,check_fmt_toml)
 	$(call run_recipe_with_details,check_typos)
 	$(call run_recipe_with_details,lint_doc)
 	$(call run_recipe_with_details,check_md_docs_are_tested)
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
 It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
 on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
 its setup a little more intricate.
@@ -138,9 +138,9 @@ Example code:
 ```rust
 use tfhe::shortint::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

-    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {

 pub fn trivium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {

 pub fn trivium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,9 +1,9 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn kreyvium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trans_ciphering/mod.rs
+++ b/apps/trivium/src/trans_ciphering/mod.rs
@@ -7,7 +7,7 @@ use tfhe::prelude::*;
 use tfhe::shortint::Ciphertext;
 use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};

-/// Triat specifying the interface for trans ciphering a FheUint64 object. Since it is meant
+/// Trait specifying the interface for trans ciphering a FheUint64 object. Since it is meant
 /// to be used with stream ciphers, encryption and decryption are by default the same.
 pub trait TransCiphering {
    fn trans_encrypt_64(&mut self, cipher: FheUint64) -> FheUint64;
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,9 +1,9 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn trivium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_5_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_6_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_5_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_6_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_5_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_6_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -87,6 +87,7 @@ fn main() {
            "cuda/include/integer/rerand.h",
            "cuda/include/aes/aes.h",
            "cuda/include/trivium/trivium.h",
+            "cuda/include/kreyvium/kreyvium.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -10,11 +10,7 @@ extern std::mutex m;
 extern bool p2p_enabled;
 extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
 extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
-extern "C" {
-int32_t cuda_setup_multi_gpu(int device_0_id);
-}
-
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
 // Define a variant type that can be either a vector or a single pointer
 template <typename Torus>
 using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,

 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type);
+uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
+                                   PBS_TYPE pbs_type);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -70,7 +68,7 @@ public:
  // Construct an empty set. Invalid use of an empty set should raise an error
  // right away through asserts or because of a nullptr dereference
  CudaStreams()
-      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
+      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count(0),
        _owns_streams(false) {}

  // Returns a subset of this set as an active subset. An active subset is one
@@ -80,7 +78,15 @@ public:
        _streams, _gpu_indexes,
        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
  }
-
+  // Returns a subset of this set as an active subset for pbs128. An active
+  // subset is one that is temporarily used to perform some computation. For
+  // pbs128, the threshold is different, because the original threshold was
+  // designed for 2_2 params.
+  CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
+  }
  // Returns a CudaStreams struct containing only the ith stream
  CudaStreams get_ith(int i) const {
    return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -114,11 +120,13 @@ public:
  // streams on the same GPU
  void create_on_same_gpus(const CudaStreams &other) {
    PANIC_IF_FALSE(_streams == nullptr,
-                   "Assign clone to non-empty cudastreams");
+                   "Cuda error: Assign clone to non-empty CudaStreams");
+    PANIC_IF_FALSE(_gpu_count <= 8,
+                   "Cuda error: GPU count should be in the interval [0, 8]");

    cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];

-    uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
+    uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
    for (uint32_t i = 0; i < other._gpu_count; ++i) {
      new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
      gpu_indexes_clone[i] = other._gpu_indexes[i];
@@ -170,6 +178,7 @@ public:
      _streams = nullptr;
      delete[] _gpu_indexes;
      _gpu_indexes = nullptr;
+      _gpu_count = 0;
    }
  }

@@ -483,4 +492,38 @@ public:
  }
 };

+// Event pool for managing temporary CUDA events in scatter/gather operations
+struct CudaEventPool {
+private:
+  std::vector<cudaEvent_t> _events;
+  std::vector<uint32_t> _gpu_indices;
+
+public:
+  CudaEventPool() {}
+
+  // Requests a new event from the pool (creates and stores it)
+  cudaEvent_t request_event(uint32_t gpu_index) {
+    cudaEvent_t event = cuda_create_event(gpu_index);
+    _events.push_back(event);
+    _gpu_indices.push_back(gpu_index);
+    return event;
+  }
+
+  // Releases all pooled events
+  // This should always be called in the release of the LUT, so streams
+  // are already synchronized
+  void release() {
+    for (size_t i = 0; i < _events.size(); i++) {
+      cuda_event_destroy(_events[i], _gpu_indices[i]);
+    }
+    _events.clear();
+    _gpu_indices.clear();
+  }
+
+  ~CudaEventPool() {
+    GPU_ASSERT(_events.empty(),
+               "CudaEventPool: must call release before destruction");
+  }
+};
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -45,12 +45,9 @@ template <typename Torus> struct boolean_bitop_buffer {

        // BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
        // only lut for degree = 1 is generated
-        generate_device_accumulator_bivariate_with_factor<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+        lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
+                                                  {lut_bivariate_f},
+                                                  gpu_memory_allocated, {}, 2);
      }
      break;
    default:
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    Torus total_modulus = params.message_modulus * params.carry_modulus;
    uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

-    int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
+    int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
    tmp_out = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
@@ -144,7 +144,6 @@ template <typename Torus> struct int_comparison_eq_buffer {

      operator_lut->generate_and_broadcast_bivariate_lut(
          active_streams, {0}, {operator_f}, gpu_memory_allocated);
-      // operator_lut->broadcast_lut(active_streams);
    } else {
      operator_lut = nullptr;
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
  uint32_t lwe_per_glwe;
+  uint32_t max_num_glwes;

+  // - num_radix_blocks: total LWE ciphertexts (radix blocks) to compress.
+  // - lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size), defined
+  //   by the chosen parameter set.
  int_compression(CudaStreams streams, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
+    this->lwe_per_glwe = lwe_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

+    // Calculate the actual number of GLWEs needed based on total radix blocks.
+    // This ensures we allocate enough memory when num_radix_blocks >
+    // lwe_per_glwe.
+    max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
+
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
            sizeof(Torus),
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
+            max_num_glwes * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

@@ -106,19 +116,14 @@ template <typename Torus> struct int_decompression {
          encryption_params.carry_modulus;
      auto effective_compression_carry_modulus = 1;

-      generate_device_accumulator_with_encoding<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          decompression_rescale_lut->get_lut(0, 0),
-          decompression_rescale_lut->get_degree(0),
-          decompression_rescale_lut->get_max_degree(0),
-          encryption_params.glwe_dimension, encryption_params.polynomial_size,
+      auto active_streams = streams.active_gpu_subset(
+          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
+      decompression_rescale_lut->generate_and_broadcast_lut_with_encoding(
+          active_streams, {0}, {decompression_rescale_f},
          effective_compression_message_modulus,
          effective_compression_carry_modulus,
          encryption_params.message_modulus, encryption_params.carry_modulus,
-          decompression_rescale_f, gpu_memory_allocated);
-      auto active_streams = streams.active_gpu_subset(
-          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
-      decompression_rescale_lut->broadcast_lut(active_streams);
+          gpu_memory_allocated);
    }
  }
  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -1045,24 +1045,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0),
-        zero_out_if_overflow_did_not_happen[0]->get_degree(0),
-        zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cur_lut_f, params.message_modulus - 2,
-        gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[0]->broadcast_lut(active_streams);
-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
-        zero_out_if_overflow_did_not_happen[1]->get_degree(0),
-        zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cur_lut_f, params.message_modulus - 1,
-        gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);
+    zero_out_if_overflow_did_not_happen[0]
+        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
+                                               gpu_memory_allocated, {},
+                                               params.message_modulus - 2);
+    zero_out_if_overflow_did_not_happen[1]
+        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
+                                               gpu_memory_allocated, {},
+                                               params.message_modulus - 1);

    // create and generate zero_out_if_overflow_happened
    zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -1079,24 +1069,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_happened[0]->get_lut(0, 0),
-        zero_out_if_overflow_happened[0]->get_degree(0),
-        zero_out_if_overflow_happened[0]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
-        gpu_memory_allocated);
-    zero_out_if_overflow_happened[0]->broadcast_lut(active_streams);
-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_happened[1]->get_lut(0, 0),
-        zero_out_if_overflow_happened[1]->get_degree(0),
-        zero_out_if_overflow_happened[1]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
-        gpu_memory_allocated);
-    zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);
+    zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {overflow_happened_f}, gpu_memory_allocated, {},
+        params.message_modulus - 2);
+    zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {overflow_happened_f}, gpu_memory_allocated, {},
+        params.message_modulus - 1);

    // merge_overflow_flags_luts
    merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -9,7 +9,7 @@
 #include "utils/helper_multi_gpu.cuh"
 #include <cmath>
 #include <functional>
-#include <map>
+#include <optional>
 #include <queue>

 #include <stdio.h>
@@ -35,8 +35,10 @@ public:
    } else if ((msg_mod) == 4 && (carry_mod) == 4) {                           \
      constexpr int max_noise_level = 5;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
-        PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 "    \
-              "parameters");                                                   \
+        PANIC(                                                                 \
+            "Cuda error: noise %d exceeds maximum authorized value 5 for 2_2"  \
+            " parameters",                                                     \
+            noise_level_expr);                                                 \
    } else if ((msg_mod) == 8 && (carry_mod) == 8) {                           \
      constexpr int max_noise_level = 9;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
@@ -102,7 +104,7 @@ template <typename Torus>
 void generate_device_accumulator_no_encoding(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
+    uint32_t polynomial_size, std::function<Torus(Torus)> f,
    bool gpu_memory_allocated);

 /*
@@ -126,6 +128,31 @@ void generate_many_lut_device_accumulator(
    uint32_t message_modulus, uint32_t carry_modulus,
    std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);

+template <typename Torus>
+void generate_device_accumulator_with_encoding_with_cpu_prealloc(
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
+    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_message_modulus, uint32_t input_carry_modulus,
+    uint32_t output_message_modulus, uint32_t output_carry_modulus,
+    std::function<Torus(Torus)> f, bool gpu_memory_allocated,
+    Torus *preallocated_h_lut);
+
+template <typename Torus>
+void generate_device_accumulator_with_cpu_prealloc(
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
+    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus)> f, bool gpu_memory_allocated,
+    Torus *preallocated_h_lut);
+
+template <typename Torus>
+void generate_device_accumulator_bivariate_with_cpu_prealloc(
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
+    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
+    Torus *h_lut);
+
 struct radix_columns {
  std::vector<uint32_t> columns_counter;
  uint32_t num_blocks;
@@ -350,6 +377,7 @@ struct int_radix_lut_custom_input_output {

  CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
  CudaStreamsBarrier multi_gpu_gather_barrier;
+  CudaEventPool event_pool;

  // Setup the LUT configuration:
  // input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
@@ -372,8 +400,13 @@ struct int_radix_lut_custom_input_output {
    this->num_input_blocks = num_input_blocks;
    this->gpu_memory_allocated = allocate_gpu_memory;

-    this->active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    if (sizeof(OutputTorus) == 16) {
+      this->active_streams =
+          streams.active_gpu_subset_u128(num_radix_blocks, params.pbs_type);
+    } else {
+      this->active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    }
  }

  void setup_degrees() {
@@ -385,9 +418,13 @@ struct int_radix_lut_custom_input_output {
  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {

+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    for (uint i = 0; i < active_streams.count(); i++) {
      cuda_set_device(active_streams.gpu_index(i));
@@ -459,11 +496,14 @@ struct int_radix_lut_custom_input_output {
                               lwe_trivial_indexes, num_radix_blocks,
                               allocate_gpu_memory);
  }
-
  void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    auto inputs_on_gpu = std::min(
        (int)num_input_blocks,
@@ -810,10 +850,13 @@ struct int_radix_lut_custom_input_output {
  void allocate_lwe_vector_for_non_trivial_indexes(
      CudaStreams streams, uint64_t max_num_radix_blocks,
      uint64_t &size_tracker, bool allocate_gpu_memory) {
-
+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    // We need to create the auxiliary array only in GPU 0
    if (active_streams.count() > 1) {
@@ -839,51 +882,131 @@ struct int_radix_lut_custom_input_output {
  void generate_and_broadcast_lut(
      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
      std::vector<std::function<OutputTorus(OutputTorus)>> f,
+      bool gpu_memory_allocated, bool use_encoding = true,
+      std::vector<OutputTorus *> cpu_prealloc_buffers = {}) {
+    // streams should be a subset of active_streams
+
+    GPU_ASSERT(
+        cpu_prealloc_buffers.empty() || (use_encoding && gpu_memory_allocated),
+        "LUT Generation with pre-allocated CPU buffer only supports generation "
+        "with encoding and expects gpu_memory_allocated==True ");
+
+    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
+      if (use_encoding) {
+        if (cpu_prealloc_buffers.empty()) {
+          generate_device_accumulator<OutputTorus>(
+              streams.stream(0), streams.gpu_index(0),
+              get_lut(0, lut_indexes[i]), get_degree(lut_indexes[i]),
+              get_max_degree(lut_indexes[i]), params.glwe_dimension,
+              params.polynomial_size, params.message_modulus,
+              params.carry_modulus, f[i], gpu_memory_allocated);
+        } else {
+          generate_device_accumulator_with_cpu_prealloc<OutputTorus>(
+              streams.stream(0), streams.gpu_index(0),
+              get_lut(0, lut_indexes[i]), get_degree(lut_indexes[i]),
+              get_max_degree(lut_indexes[i]), params.glwe_dimension,
+              params.polynomial_size, params.message_modulus,
+              params.carry_modulus, f[i], true, cpu_prealloc_buffers[i]);
+        }
+      } else {
+        generate_device_accumulator_no_encoding<OutputTorus>(
+            streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
+            get_degree(lut_indexes[i]), params.message_modulus,
+            params.carry_modulus, params.glwe_dimension, params.polynomial_size,
+            f[i], gpu_memory_allocated);
+      }
+    }
+    broadcast_lut(streams);
+  }
+
+  // Generate and broadcast LUT with custom input/output encoding parameters.
+  // This is useful when the input and output message/carry modulus differ,
+  // such as in decompression rescaling.
+  void generate_and_broadcast_lut_with_encoding(
+      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
+      std::vector<std::function<OutputTorus(OutputTorus)>> f,
+      uint32_t input_message_modulus, uint32_t input_carry_modulus,
+      uint32_t output_message_modulus, uint32_t output_carry_modulus,
+      bool gpu_memory_allocated,
+      std::vector<OutputTorus *> cpu_prealloc_buffers = {}) {
+
+    GPU_ASSERT(cpu_prealloc_buffers.empty() || gpu_memory_allocated,
+               "LUT Generation with pre-allocated CPU buffer expects "
+               "gpu_memory_allocated==True ");
+
+    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
+      if (cpu_prealloc_buffers.empty()) {
+        generate_device_accumulator_with_encoding<OutputTorus>(
+            streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
+            get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
+            params.glwe_dimension, params.polynomial_size,
+            input_message_modulus, input_carry_modulus, output_message_modulus,
+            output_carry_modulus, f[i], gpu_memory_allocated);
+      } else {
+        generate_device_accumulator_with_encoding_with_cpu_prealloc<
+            OutputTorus>(
+            streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
+            get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
+            params.glwe_dimension, params.polynomial_size,
+            input_message_modulus, input_carry_modulus, output_message_modulus,
+            output_carry_modulus, f[i], true, cpu_prealloc_buffers[i]);
+      }
+    }
+    broadcast_lut(streams);
+  }
+
+  void generate_and_broadcast_many_lut(
+      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
+      std::vector<std::vector<std::function<OutputTorus(OutputTorus)>>>
+          funcs_many_lut,
      bool gpu_memory_allocated) {
    // streams should be a subset of active_streams

    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
-      generate_device_accumulator<OutputTorus>(
+      generate_many_lut_device_accumulator<OutputTorus>(
          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, f[i], gpu_memory_allocated);
+          params.carry_modulus, funcs_many_lut[i], gpu_memory_allocated);
    }
-   //broadcast_lut(streams);
+    broadcast_lut(streams);
  }

  void generate_and_broadcast_bivariate_lut(
      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
      std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
-      bool gpu_memory_allocated) {
+      bool gpu_memory_allocated,
+      std::vector<OutputTorus *> cpu_prealloc_buffers = {},
+      std::optional<int> factor = std::nullopt) {
    // streams should be a subset of active_streams

-    /*    for (int fidx = 0; fidx < f.size(); ++fidx) {
-          __int128_t f_hash = 0;
-          uint32_t bits_per_lut_val = 5;
-          uint32_t input_modulus_sup =
-              params.message_modulus * params.carry_modulus;
-          for (uint32_t i = 0; i < input_modulus_sup; ++i) {
-            OutputTorus f_eval =
-                f[fidx](i / params.message_modulus, i % params.message_modulus);
-            GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
-                       "LUT value expected bitwidth overflow");
-            f_hash |= f_eval;
-            f_hash <<= bits_per_lut_val;
-          }
-          printf("%016llX%016llX\n",
-                 (unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
-                 (unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
-        }
-    */
    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
-      generate_device_accumulator_bivariate<InputTorus>(
-          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
-          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, f[i], gpu_memory_allocated);
+      if (cpu_prealloc_buffers.empty()) {
+        if (factor.has_value()) {
+          generate_device_accumulator_bivariate_with_factor<OutputTorus>(
+              streams.stream(0), streams.gpu_index(0),
+              get_lut(0, lut_indexes[i]), get_degree(lut_indexes[i]),
+              get_max_degree(lut_indexes[i]), params.glwe_dimension,
+              params.polynomial_size, params.message_modulus,
+              params.carry_modulus, f[i], factor.value(), gpu_memory_allocated);
+        } else {
+          generate_device_accumulator_bivariate<OutputTorus>(
+              streams.stream(0), streams.gpu_index(0),
+              get_lut(0, lut_indexes[i]), get_degree(lut_indexes[i]),
+              get_max_degree(lut_indexes[i]), params.glwe_dimension,
+              params.polynomial_size, params.message_modulus,
+              params.carry_modulus, f[i], gpu_memory_allocated);
+        }
+      } else {
+        generate_device_accumulator_bivariate_with_cpu_prealloc<OutputTorus>(
+            streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
+            get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
+            params.glwe_dimension, params.polynomial_size,
+            params.message_modulus, params.carry_modulus, f[i], true,
+            cpu_prealloc_buffers[i]);
+      }
    }
-    //broadcast_lut(streams);
+    broadcast_lut(streams);
  }

  void release(CudaStreams streams) {
@@ -916,6 +1039,7 @@ struct int_radix_lut_custom_input_output {

    if (active_streams.count() > 1) {
      active_streams.synchronize();
+      event_pool.release();
      multi_gpu_gather_barrier.release();
      multi_gpu_broadcast_barrier.release();
      multi_gpu_scatter_barrier.release();
@@ -1003,13 +1127,8 @@ struct int_noise_squashing_lut
    // lut for the squashing
    auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };

-    generate_device_accumulator<__uint128_t>(
-        this->active_streams.stream(0), this->active_streams.gpu_index(0),
-        this->get_lut(0, 0), this->get_degree(0), this->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, f_squash, allocate_gpu_memory);
-
-    this->broadcast_lut(this->active_streams);
+    this->generate_and_broadcast_lut(this->active_streams, {0}, {f_squash},
+                                     allocate_gpu_memory);
  }

  using int_radix_lut_custom_input_output<InputTorus, __uint128_t>::release;
@@ -1066,7 +1185,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {

    lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
                                    gpu_memory_allocated);
-    // lut->broadcast_lut(active_streams);

    /**
     * the input indexes should take the first bits_per_block PBS to target
@@ -1298,10 +1416,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    }

    if (allocated_luts_message_carry) {
-
-      auto message_acc = luts_message_carry->get_lut(0, 0);
-      auto carry_acc = luts_message_carry->get_lut(0, 1);
-
      // define functions for each accumulator
      auto lut_f_message = [message_modulus](Torus x) -> Torus {
        return x % message_modulus;
@@ -1449,7 +1563,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
                            uint32_t group_size, uint32_t big_lwe_size_bytes,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
-
    grouping_size = group_size;
    group_resolved_carries = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -1481,7 +1594,8 @@ template <typename Torus> struct int_seq_group_prop_memory {
        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
    lut_sequential_algorithm->generate_and_broadcast_lut(
        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
-    // lut_sequential_algorithm->broadcast_lut(active_streams);
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_seq_lut_indexes);
  }

@@ -1553,10 +1667,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
      uint64_t &size_tracker) {

    gpu_memory_allocated = allocate_gpu_memory;
-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    shifted_blocks_and_states = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -1590,17 +1701,15 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
        return OUTPUT_CARRY::NONE;
      }
    };
+
+    std::vector<std::vector<std::function<Torus(Torus)>>> luts_array_funcs;
+    std::vector<uint32_t> lut_func_indexes;
+
    std::vector<std::function<Torus(Torus)>> f_first_grouping_luts = {
        f_first_block_state, f_shift_block};

-    auto first_block_lut = luts_array_first_step->get_lut(0, 0);
-    auto first_block_lut_degrees = luts_array_first_step->get_degree(0);
-    auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0);
-    generate_many_lut_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), first_block_lut,
-        first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
-        gpu_memory_allocated);
+    luts_array_funcs.push_back(f_first_grouping_luts);
+    lut_func_indexes.push_back(0);

    // luts for other blocks of the first grouping
    for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
@@ -1617,13 +1726,9 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
      };
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};
-      auto lut = luts_array_first_step->get_lut(0, lut_id);
-      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
-      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
-      generate_many_lut_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
-          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
-          carry_modulus, f_grouping_luts, gpu_memory_allocated);
+
+      luts_array_funcs.push_back(f_grouping_luts);
+      lut_func_indexes.push_back(lut_id);
    }

    // luts for the rest of groupings (except for the last block)
@@ -1643,13 +1748,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};

-      auto lut = luts_array_first_step->get_lut(0, lut_id);
-      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
-      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
-      generate_many_lut_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
-          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
-          carry_modulus, f_grouping_luts, gpu_memory_allocated);
+      luts_array_funcs.push_back(f_grouping_luts);
+      lut_func_indexes.push_back(lut_id);
    }

    // For the last block we need to generate a new lut
@@ -1662,19 +1762,11 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {

    uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step

-    auto last_block_lut = luts_array_first_step->get_lut(0, lut_id);
-    auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id);
-    auto last_block_lut_max_degree =
-        luts_array_first_step->get_max_degree(lut_id);
-
    std::vector<std::function<Torus(Torus)>> f_last_grouping_luts = {
        f_last_block_state, f_shift_block};

-    generate_many_lut_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), last_block_lut,
-        last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
-        gpu_memory_allocated);
+    luts_array_funcs.push_back(f_last_grouping_luts);
+    lut_func_indexes.push_back(lut_id);

    // Generate the indexes to switch between luts within the pbs
    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
@@ -1703,10 +1795,12 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-    // Do I need to do something else for the multi-gpu?
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array_first_step->broadcast_lut(active_streams);
+    luts_array_first_step->generate_and_broadcast_many_lut(
+        active_streams, lut_func_indexes, luts_array_funcs,
+        allocate_gpu_memory);
  };
  void release(CudaStreams streams) {

@@ -1954,8 +2048,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
    luts_array_second_step->generate_and_broadcast_lut(
        active_streams, lut_ids, lut_funcs, gpu_memory_allocated);

-    // luts_array_second_step->broadcast_lut(active_streams);
-
    if (use_sequential_algorithm_to_resolve_group_carries) {

      seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
@@ -1968,6 +2060,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
          size_tracker);
    }

+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_second_lut_indexes);
  };

@@ -2072,7 +2165,7 @@ template <typename Torus> struct int_sc_prop_memory {
    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
-    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
+    num_groups = CEIL_DIV(num_radix_blocks, grouping_size);

    num_many_lut = 2; // many luts apply 2 luts
    uint32_t box_size = polynomial_size / block_modulus;
@@ -2148,6 +2241,7 @@ template <typename Torus> struct int_sc_prop_memory {
        streams, params, num_luts_message_extract, num_radix_blocks + 1,
        allocate_gpu_memory, size_tracker);
    // lut for the first block in the first grouping
+    // this LUT is used on slot 0 for all values of outputFlag
    auto f_message_extract = [message_modulus](Torus block) -> Torus {
      return (block >> 1) % message_modulus;
    };
@@ -2159,7 +2253,15 @@ template <typename Torus> struct int_sc_prop_memory {
    // It seems that this lut could be apply together with the other one but for
    // now we won't do it
    switch (requested_flag) {
-    case outputFlag::FLAG_OVERFLOW: { // Overflow case
+    case outputFlag::FLAG_NONE:
+      // In this case a single LUT is generated with the message extract
+      // function
+      lut_message_extract->generate_and_broadcast_lut(
+          active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
+      break;
+    case outputFlag::FLAG_OVERFLOW: {
+      // Overflow case, an additional LUT is generated in addition to the
+      // message extract one
      auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
        uint32_t position = (num_radix_blocks == 1 &&
@@ -2182,7 +2284,9 @@ template <typename Torus> struct int_sc_prop_memory {
          gpu_memory_allocated);
      break;
    }
-    case outputFlag::FLAG_CARRY: { // Carry case
+    case outputFlag::FLAG_CARRY: {
+      // Carry case, an additional LUT is generated in addition to the message
+      // extract one

      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
                                                    allocate_gpu_memory);
@@ -2197,13 +2301,9 @@ template <typename Torus> struct int_sc_prop_memory {
      break;
    }
    default:
-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
-      break;
+      PANIC("Invalid output flag in int_sc_prop_memory");
    }
-
-    // lut_message_extract->broadcast_lut(active_streams);
-  };
+  }

  void release(CudaStreams streams) {

@@ -2245,10 +2345,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
      uint64_t &size_tracker) {

    gpu_memory_allocated = allocate_gpu_memory;
-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    shifted_blocks_and_borrow_states = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -2272,6 +2369,9 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
        streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
        allocate_gpu_memory, size_tracker);

+    std::vector<std::vector<std::function<Torus(Torus)>>> luts_array_funcs;
+    std::vector<uint32_t> lut_func_indexes;
+
    auto f_shift_block = [message_modulus](Torus block) -> Torus {
      uint64_t overflow_guard = message_modulus;
      uint64_t block_mod = block % message_modulus;
@@ -2288,15 +2388,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
    std::vector<std::function<Torus(Torus)>> f_first_grouping_luts = {
        f_first_block_state, f_shift_block};

-    auto first_block_lut = luts_array_first_step->get_lut(0, 0);
-    auto first_block_lut_degrees = luts_array_first_step->get_degree(0);
-    auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0);
-
-    generate_many_lut_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), first_block_lut,
-        first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
-        gpu_memory_allocated);
+    luts_array_funcs.push_back(f_first_grouping_luts);
+    lut_func_indexes.push_back(0);

    // luts for other blocks of the first grouping
    for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
@@ -2313,13 +2406,9 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
      };
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};
-      auto lut = luts_array_first_step->get_lut(0, lut_id);
-      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
-      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
-      generate_many_lut_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
-          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
-          carry_modulus, f_grouping_luts, gpu_memory_allocated);
+
+      luts_array_funcs.push_back(f_grouping_luts);
+      lut_func_indexes.push_back(lut_id);
    }

    // luts for the rest of groupings (except for the last block)
@@ -2339,13 +2428,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};

-      auto lut = luts_array_first_step->get_lut(0, lut_id);
-      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
-      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
-      generate_many_lut_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
-          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
-          carry_modulus, f_grouping_luts, gpu_memory_allocated);
+      luts_array_funcs.push_back(f_grouping_luts);
+      lut_func_indexes.push_back(lut_id);
    }

    auto f_last_block_state = [message_modulus](Torus block) -> Torus {
@@ -2357,19 +2441,11 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {

    uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step

-    auto last_block_lut = luts_array_first_step->get_lut(0, lut_id);
-    auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id);
-    auto last_block_lut_max_degree =
-        luts_array_first_step->get_max_degree(lut_id);
-
    std::vector<std::function<Torus(Torus)>> f_last_grouping_luts = {
        f_last_block_state, f_shift_block};

-    generate_many_lut_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), last_block_lut,
-        last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
-        gpu_memory_allocated);
+    luts_array_funcs.push_back(f_last_grouping_luts);
+    lut_func_indexes.push_back(lut_id);

    // Generate the indexes to switch between luts within the pbs
    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
@@ -2397,10 +2473,13 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
+
    // Do I need to do something else for the multi-gpu?
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array_first_step->broadcast_lut(active_streams);
+    luts_array_first_step->generate_and_broadcast_many_lut(
+        active_streams, lut_func_indexes, luts_array_funcs,
+        gpu_memory_allocated);
  };

  // needed for the division to update the lut indexes
@@ -2462,7 +2541,6 @@ template <typename Torus> struct int_borrow_prop_memory {
                         uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
-    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
@@ -2472,7 +2550,7 @@ template <typename Torus> struct int_borrow_prop_memory {
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
    group_size = grouping_size;
-    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
+    num_groups = CEIL_DIV(num_radix_blocks, grouping_size);

    num_many_lut = 2; // many luts apply 2 luts
    uint32_t box_size = polynomial_size / block_modulus;
@@ -2537,7 +2615,9 @@ template <typename Torus> struct int_borrow_prop_memory {
  void release(CudaStreams streams) {

    shifted_blocks_borrow_state_mem->release(streams);
+    delete shifted_blocks_borrow_state_mem;
    prop_simu_group_carries_mem->release(streams);
+    delete prop_simu_group_carries_mem;
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   overflow_block, gpu_memory_allocated);

--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -44,8 +44,6 @@ template <typename Torus> struct int_mul_memory {
          active_streams, {0}, {zero_out_predicate_lut_f},
          gpu_memory_allocated);

-      // zero_out_predicate_lut->broadcast_lut(active_streams);
-
      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);

@@ -85,8 +83,6 @@ template <typename Torus> struct int_mul_memory {
    // luts_array -> lut = {lsb_acc, msb_acc}
    luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
                                          allocate_gpu_memory, size_tracker);
-    auto lsb_acc = luts_array->get_lut(0, 0);
-    auto msb_acc = luts_array->get_lut(0, 1);

    // define functions for each accumulator
    auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
    uint32_t calculated_active_blocks =
        total_random_bits == 0
            ? 0
-            : (total_random_bits + message_bits_per_block - 1) /
-                  message_bits_per_block;
+            : CEIL_DIV(total_random_bits, message_bits_per_block);
    if (num_blocks_to_process != calculated_active_blocks) {
      PANIC(
          "num_blocks_to_process should be equal to calculated_active_blocks");
@@ -53,6 +52,10 @@ template <typename Torus> struct int_grouped_oprf_memory {

    // Pre-generate all possible LUTs.
    //
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
+    std::vector<uint64_t> lut_degrees;
+
    for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
         ++random_bit) {
      uint64_t p = 1ULL << random_bit;
@@ -70,14 +73,13 @@ template <typename Torus> struct int_grouped_oprf_memory {

      uint64_t degree = 0;
      uint32_t lut_index = random_bit - 1;
-      generate_device_accumulator_no_encoding<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts->get_lut(0, lut_index),
-          degree, params.message_modulus, params.carry_modulus,
-          params.glwe_dimension, params.polynomial_size, lut_f,
-          allocate_gpu_memory);
+
+      lut_funcs.push_back(lut_f);
+      lut_indices.push_back(lut_index);
+
      // In  OPRF the degree is hard set to p - 1 instead of the LUT degree
      degree = p - 1;
-      *luts->get_degree(lut_index) = degree;
+      lut_degrees.push_back(degree);
    }

    // For each block, this loop determines the exact number of bits to generate
@@ -128,7 +130,16 @@ template <typename Torus> struct int_grouped_oprf_memory {
        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
-    luts->broadcast_lut(active_streams);
+
+    // No encoding for these LUTs. LUT generation also sets the LUT degrees to
+    // default values
+    luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
+                                     allocate_gpu_memory, false);
+
+    // OPRF requires custom LUT degrees
+    for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
+      *luts->get_degree(i) = lut_degrees[i];
+    }

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_corrections);
@@ -170,8 +181,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->num_random_input_blocks =
-        (num_input_random_bits + message_bits_per_block - 1) /
-        message_bits_per_block;
+        CEIL_DIV(num_input_random_bits, message_bits_per_block);

    this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
        streams, params, this->num_random_input_blocks, message_bits_per_block,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -113,7 +113,6 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
      else
        return current_bit;
    };
-    ;
    auto active_gpu_count_mux = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);

--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -7,7 +7,8 @@
 #include <functional>
 #include <vector>

-const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
+// If we use more than 5 streams the result is incorrect
+const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;

 template <typename Torus> struct int_equality_selectors_buffer {
  int_radix_params params;
@@ -60,18 +61,10 @@ template <typename Torus> struct int_equality_selectors_buffer {
      fns.push_back([i](Torus x) -> Torus { return (x == i); });
    }

-    generate_many_lut_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->comparison_luts->get_lut(0, 0),
-        this->comparison_luts->get_degree(0),
-        this->comparison_luts->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        fns, allocate_gpu_memory);
-
+    this->comparison_luts->generate_and_broadcast_many_lut(
+        active_streams, {0}, {fns}, allocate_gpu_memory);
    fns.clear();

-    this->comparison_luts->broadcast_lut(active_streams);
-
    this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), this->tmp_many_luts_output,
@@ -175,8 +168,7 @@ template <typename Torus> struct int_possible_results_buffer {
    this->lut_stride =
        (ciphertext_modulus / this->max_luts_per_call) * box_size;

-    this->num_lut_accumulators =
-        (total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
+    this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);

    stream_luts =
        new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
@@ -202,15 +194,10 @@ template <typename Torus> struct int_possible_results_buffer {
          fns.push_back([c](Torus x) -> Torus { return (x == 1) * c; });
        }

-        generate_many_lut_device_accumulator<Torus>(
-            streams.stream(0), streams.gpu_index(0), current_lut->get_lut(0, 0),
-            current_lut->get_degree(0), current_lut->get_max_degree(0),
-            params.glwe_dimension, params.polynomial_size,
-            params.message_modulus, params.carry_modulus, fns,
+        current_lut->generate_and_broadcast_many_lut(
+            streams.active_gpu_subset(1, params.pbs_type), {0}, {fns},
            allocate_gpu_memory);

-        current_lut->broadcast_lut(
-            streams.active_gpu_subset(1, params.pbs_type));
        stream_luts[lut_count++] = current_lut;
        lut_value_start += luts_in_this_call;
      }
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                int8_t **fp_ks_buffer,
                                                bool gpu_memory_allocated);

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count);
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
@@ -0,0 +1,24 @@
+#ifndef KREYVIUM_H
+#define KREYVIUM_H
+
+#include "../integer/integer.h"
+
+extern "C" {
+uint64_t scratch_cuda_kreyvium_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+
+void cuda_kreyvium_generate_keystream_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
+    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
+    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks);
+
+void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+}
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium_utilities.h
@@ -0,0 +1,320 @@
+#ifndef KREYVIUM_UTILITIES_H
+#define KREYVIUM_UTILITIES_H
+#include "../integer/integer_utilities.h"
+
+// Kreyvium specific constants
+// The batch size is set to 64 to allow efficient parallel processing of 64
+// steps at once.
+constexpr uint32_t KREYVIUM_BATCH_SIZE = 64;
+
+// In each Kreyvium step, there are exactly 3 non-linear AND operations:
+// 1. (c109 & c108)
+// 2. (a91 & a90)
+// 3. (b82 & b81)
+constexpr uint32_t KREYVIUM_NUM_AND_GATES = 3;
+
+// In each Kreyvium step, there are 4 paths that require a "flush"
+// to noise-cancel and extract the bit:
+// 1. New bit for Register A
+// 2. New bit for Register B
+// 3. New bit for Register C
+// 4. The Output Keystream bit
+constexpr uint32_t KREYVIUM_NUM_FLUSH_PATHS = 4;
+
+/// Struct to hold the LUTs.
+template <typename Torus> struct int_kreyvium_lut_buffers {
+  // Bivariate AND Gate LUT:
+  // AND operation: f(a, b) = (a & 1) & (b & 1).
+  // This is a Bivariate PBS used for the non-linear parts of Kreyvium.
+  int_radix_lut<Torus> *and_lut;
+
+  // Univariate Flush/Identity LUT:
+  // MESSAGE EXTRACTION operation: f(x) = x & 1.
+  // This is a Univariate PBS used to "flush" the state (reset noise/carries).
+  int_radix_lut<Torus> *flush_lut;
+
+  int_kreyvium_lut_buffers(CudaStreams streams, const int_radix_params &params,
+                           bool allocate_gpu_memory, uint32_t num_inputs,
+                           uint64_t &size_tracker) {
+
+    uint32_t and_ops =
+        num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_AND_GATES;
+    uint32_t flush_ops =
+        num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_FLUSH_PATHS;
+
+    this->and_lut = new int_radix_lut<Torus>(streams, params, 1, and_ops,
+                                             allocate_gpu_memory, size_tracker);
+
+    std::function<Torus(Torus, Torus)> and_lambda =
+        [](Torus lhs, Torus rhs) -> Torus { return (lhs & 1) & (rhs & 1); };
+
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
+        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_lambda, allocate_gpu_memory);
+
+    auto active_streams_and =
+        streams.active_gpu_subset(and_ops, params.pbs_type);
+    this->and_lut->broadcast_lut(active_streams_and);
+    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
+
+    this->flush_lut = new int_radix_lut<Torus>(
+        streams, params, 1, flush_ops, allocate_gpu_memory, size_tracker);
+
+    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
+      return x & 1;
+    };
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
+        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+
+    auto active_streams_flush =
+        streams.active_gpu_subset(flush_ops, params.pbs_type);
+    this->flush_lut->broadcast_lut(active_streams_flush);
+    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
+  }
+
+  void release(CudaStreams streams) {
+    this->and_lut->release(streams);
+    delete this->and_lut;
+    this->and_lut = nullptr;
+
+    this->flush_lut->release(streams);
+    delete this->flush_lut;
+    this->flush_lut = nullptr;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+/// Struct to hold the Kreyvium internal state and temporary workspaces.
+template <typename Torus> struct int_kreyvium_state_workspaces {
+
+  CudaRadixCiphertextFFI *a_reg;
+  CudaRadixCiphertextFFI *b_reg;
+  CudaRadixCiphertextFFI *c_reg;
+  CudaRadixCiphertextFFI *k_reg;
+  CudaRadixCiphertextFFI *iv_reg;
+
+  // Shift Workspace
+  CudaRadixCiphertextFFI *shift_workspace;
+
+  // Temporary Update Buffers
+  CudaRadixCiphertextFFI *temp_a;
+  CudaRadixCiphertextFFI *temp_b;
+  CudaRadixCiphertextFFI *temp_c;
+
+  CudaRadixCiphertextFFI *packed_and_lhs;
+  CudaRadixCiphertextFFI *packed_and_rhs;
+  CudaRadixCiphertextFFI *packed_and_out;
+
+  // Flush/Cleanup Packing Buffers
+  CudaRadixCiphertextFFI *packed_flush_in;
+  CudaRadixCiphertextFFI *packed_flush_out;
+
+  uint32_t max_batch_blocks;
+  uint32_t k_offset;
+  uint32_t iv_offset;
+
+  int_kreyvium_state_workspaces(CudaStreams streams,
+                                const int_radix_params &params,
+                                bool allocate_gpu_memory, uint32_t num_inputs,
+                                uint64_t &size_tracker) {
+
+    uint32_t batch_blocks = KREYVIUM_BATCH_SIZE * num_inputs;
+    this->max_batch_blocks = batch_blocks;
+    this->k_offset = 0;
+    this->iv_offset = 0;
+
+    this->a_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->b_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->c_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->k_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->k_reg, 128 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->iv_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->iv_reg, 128 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->shift_workspace = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->shift_workspace,
+        128 * num_inputs, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    this->temp_a = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->temp_a, batch_blocks,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->temp_b = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->temp_b, batch_blocks,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->temp_c = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->temp_c, batch_blocks,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->packed_and_lhs = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_and_lhs,
+        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_and_rhs = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_and_rhs,
+        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_and_out = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_and_out,
+        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_flush_in = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
+        KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_flush_out = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
+        KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+  }
+
+  void release(CudaStreams streams, bool allocate_gpu_memory) {
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->a_reg, allocate_gpu_memory);
+    delete this->a_reg;
+    this->a_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->b_reg, allocate_gpu_memory);
+    delete this->b_reg;
+    this->b_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->c_reg, allocate_gpu_memory);
+    delete this->c_reg;
+    this->c_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->k_reg, allocate_gpu_memory);
+    delete this->k_reg;
+    this->k_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->iv_reg, allocate_gpu_memory);
+    delete this->iv_reg;
+    this->iv_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->shift_workspace, allocate_gpu_memory);
+    delete this->shift_workspace;
+    this->shift_workspace = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->temp_a, allocate_gpu_memory);
+    delete this->temp_a;
+    this->temp_a = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->temp_b, allocate_gpu_memory);
+    delete this->temp_b;
+    this->temp_b = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->temp_c, allocate_gpu_memory);
+    delete this->temp_c;
+    this->temp_c = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_and_lhs, allocate_gpu_memory);
+    delete this->packed_and_lhs;
+    this->packed_and_lhs = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_and_rhs, allocate_gpu_memory);
+    delete this->packed_and_rhs;
+    this->packed_and_rhs = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_and_out, allocate_gpu_memory);
+    delete this->packed_and_out;
+    this->packed_and_out = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_flush_in, allocate_gpu_memory);
+    delete this->packed_flush_in;
+    this->packed_flush_in = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_flush_out, allocate_gpu_memory);
+    delete this->packed_flush_out;
+    this->packed_flush_out = nullptr;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+template <typename Torus> struct int_kreyvium_buffer {
+  int_radix_params params;
+  bool allocate_gpu_memory;
+  uint32_t num_inputs;
+
+  int_kreyvium_lut_buffers<Torus> *luts;
+  int_kreyvium_state_workspaces<Torus> *state;
+
+  int_kreyvium_buffer(CudaStreams streams, const int_radix_params &params,
+                      bool allocate_gpu_memory, uint32_t num_inputs,
+                      uint64_t &size_tracker) {
+    this->params = params;
+    this->allocate_gpu_memory = allocate_gpu_memory;
+    this->num_inputs = num_inputs;
+
+    this->luts = new int_kreyvium_lut_buffers<Torus>(
+        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
+
+    this->state = new int_kreyvium_state_workspaces<Torus>(
+        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
+  }
+
+  void release(CudaStreams streams) {
+    luts->release(streams);
+    delete luts;
+    luts = nullptr;
+
+    state->release(streams, allocate_gpu_memory);
+    delete state;
+    state = nullptr;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
-    const bool *is_boolean_array, uint32_t num_compact_lists,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    KS_TYPE casting_key_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
+    uint32_t num_compact_lists, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
-                const bool *is_boolean_array, uint32_t num_compact_lists,
+                const bool *is_boolean_array,
+                const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
@@ -236,14 +237,36 @@ template <typename Torus> struct zk_expand_mem {
      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
        auto lwe_index = i + num_packed_msgs * offset;
        auto lwe_index_in_list = i % num_lwes_in_kth;
+        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       lwe_index, num_packed_msgs * num_lwes);
        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
        h_indexes_out[lwe_index] =
            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        // If the input relates to a boolean, shift the LUT so the correct one
-        // with sanitization is used
+        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
+        // is_boolean_array tells us which input is a boolean and thus the
+        // related output needs boolean sanitization. It naturally has
+        // total_blocks entries, but h_indexes_out reaches
+        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
+        // the ceiling causes out-of-bounds access. Reading garbage "true" would
+        // set h_lut_indexes to an invalid index pointing to uninitialized
+        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
+        // to match.
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                       "Cuda error: index %d for is_boolean_array is out of "
+                       "bounds (len is %d)",
+                       h_indexes_out[lwe_index], is_boolean_array_len);
        auto boolean_offset =
            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+        PANIC_IF_FALSE(
+            h_lut_indexes[lwe_index] < 4,
+            "Cuda error: lut index is greater than the max possible value (3)");
      }
      offset += num_lwes_in_kth;
    }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
      base_log, level_count, num_lwes);
 }

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count) {
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count) {
  host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
                                  static_cast<const uint64_t *>(input),
                                  static_cast<uint64_t *>(output), base_log,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -10,7 +10,6 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
@@ -12,12 +12,9 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 // Finish the keyswitching operation and prepare GLWEs for accumulation.
 // 1. Finish the keyswitching computation partially performed with a GEMM:
 //  - negate the dot product between the GLWE and KSK polynomial
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -6,7 +6,7 @@
 #include "helper_multi_gpu.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <limits>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,4 +1,5 @@
 #include "device.h"
+#include "utils/helper.cuh"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <mutex>
@@ -6,6 +7,27 @@
 #include <cuda_profiler_api.h>
 #endif

+void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+}
+
+int validate_device_ptr(const void *ptr) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+  return attr.device;
+}
+
 uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
@@ -30,8 +52,9 @@ bool mem_pools_enabled = false;
 // better results.
 void cuda_setup_mempool(uint32_t caller_gpu_index) {
  if (!mem_pools_enabled) {
-    pool_mutex.lock();
-    if (mem_pools_enabled)
+    std::lock_guard lock(pool_mutex);
+    if (mem_pools_enabled) // double-check - mem_pools_enabled might have been
+                           // changed in a different thread
      return; // If mem pools are already enabled, we don't need to do anything

    // We do it only once for all GPUs
@@ -78,7 +101,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
    }
    // We return to the original gpu_index
    cuda_set_device(caller_gpu_index);
-    pool_mutex.unlock();
  }
 }

@@ -234,60 +256,61 @@ bool cuda_check_support_thread_block_clusters() {
 #endif
 }

-/// Copy memory to the GPU asynchronously
+/// Copy memory from the CPU to a GPU with size tracking.
+/// This copy is asynchronous only if the CPU memory was pinned, i.e.
+/// allocated using cudaMallocHost. This was shown to come with a performance
+/// penalty if we allocate all CPU data in this way in the backend, so
+/// cudaMallocHost is only used in specific places where we need an
+/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
+/// example to copy the bootstrapping key).
+/// The copy only happens if gpu_memory_allocated is true.
 void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                  uint64_t size,
                                                  cudaStream_t stream,
                                                  uint32_t gpu_index,
                                                  bool gpu_memory_allocated) {
+
+  GPU_ASSERT(src != nullptr, "Cuda error: null host ptr");
+
   if (size == 0 || !gpu_memory_allocated)
     return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
-  }
+  validate_device_ptr_and_gpu_index(dest, gpu_index);

   cuda_set_device(gpu_index);
   check_cuda_error(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }

-/// Copy memory to the GPU asynchronously
+/// Copy memory from the CPU to a GPU.
+/// This copy is asynchronous only if the CPU memory was pinned, i.e.
+/// allocated using cudaMallocHost. This was shown to come with a performance
+/// penalty if we allocate all CPU data in this way in the backend, so
+/// cudaMallocHost is only used in specific places where we need an
+/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
+/// example to copy the bootstrapping key).
 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_with_size_tracking_async_to_gpu(dest, src, size, stream,
                                              gpu_index, true);
 }

-/// Copy memory within a GPU asynchronously
+/// Copy memory within a GPU asynchronously.
+/// The copy only happens if gpu_memory_allocated is true
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    void *dest, void const *src, uint64_t size, cudaStream_t stream,
    uint32_t gpu_index, bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  GPU_ASSERT(dest != nullptr,
-             "Cuda error: trying to copy gpu->gpu to null ptr");
-  GPU_ASSERT(src != nullptr,
-             "Cuda error: trying to copy gpu->gpu from null ptr");

-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  PANIC_IF_FALSE(
-      attr_dest.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  PANIC_IF_FALSE(
-      attr_src.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
+  int src_gpu_index = validate_device_ptr(src);
+  int dest_gpu_index = validate_device_ptr(dest);
  cuda_set_device(gpu_index);
-  if (attr_src.device == attr_dest.device) {
+  if (src_gpu_index == dest_gpu_index) {
    check_cuda_error(
        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
  } else {
-    check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
-                                         attr_src.device, size, stream));
+    check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
+                                         src_gpu_index, size, stream));
  }
 }
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -327,21 +350,20 @@ void cuda_synchronize_device(uint32_t gpu_index) {
  check_cuda_error(cudaDeviceSynchronize());
 }

+/// cuda_memset sets bytes, we basically only use it to initialize data to 0
+/// The memset only happens if gpu_memory_allocated is true
 void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          uint64_t size, cudaStream_t stream,
                                          uint32_t gpu_index,
                                          bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
-  }
+  validate_device_ptr_and_gpu_index(dest, gpu_index);
  cuda_set_device(gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

+/// cuda_memset sets bytes, we basically only use it to initialize data to 0
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index) {
  cuda_memset_with_size_tracking_async(dest, val, size, stream, gpu_index,
@@ -366,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    }
    cuda_set_device(gpu_index);
    int block_size = 256;
-    int num_blocks = (n + block_size - 1) / block_size;
+    int num_blocks = CEIL_DIV(n, block_size);

    // Launch the kernel
    cuda_set_value_kernel<Torus>
@@ -384,15 +406,15 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                                   uint32_t n);

 /// Copy memory to the CPU asynchronously
+/// This comes with a big penalty on performance even if the CPU
+/// memory is pinned (using cudaMallocHost for the CPU allocation),
+/// so it should be avoided at all costs
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
+  GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
  if (size == 0)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
-  }
+  validate_device_ptr_and_gpu_index(src, gpu_index);

  cuda_set_device(gpu_index);
  check_cuda_error(
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -68,9 +68,15 @@ struct alignas(16) f128 {
    auto t = two_sum(a.lo, b.lo);

    double hi = s.hi;
+#ifdef __CUDA_ARCH__
+    double lo = __dadd_rn(s.lo, t.hi);
+    hi = __dadd_rn(hi, lo);
+    lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
+#else
    double lo = s.lo + t.hi;
    hi = hi + lo;
    lo = lo - (hi - s.hi);
+#endif

    return f128(hi, lo + t.lo);
  }
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
    auto s = two_diff(a.hi, b.hi);
    auto t = two_diff(a.lo, b.lo);
+#ifdef __CUDA_ARCH__
+    s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
+    return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
+#else
    s = quick_two_sum(s.hi, s.lo + t.hi);
    return quick_two_sum(s.hi, s.lo + t.lo);
+#endif
  }

  // Multiplication
@@ -220,16 +231,16 @@ struct f128x2 {
  // Subtraction
  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
                                              const f128x2 &b) {
-    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
-                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
+    return f128x2(f128::sub_estimate(a.re, b.re),
+                  f128::sub_estimate(a.im, b.im));
  }

  // Multiplication (complex multiplication)
  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
                                              const f128x2 &b) {
+    const f128 a_im_b_im = f128::mul(a.im, b.im);
    f128 real_part =
-        f128::add(f128::mul(a.re, b.re),
-                  f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
+        f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
    return f128x2(real_part, imag_part);
  }
@@ -243,8 +254,8 @@ struct f128x2 {

  // Subtraction-assignment operator
  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
-    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
-    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
+    re = f128::sub_estimate(re, other.re);
+    im = f128::sub_estimate(im, other.im);
    return *this;
  }

@@ -261,12 +272,20 @@ struct f128x2 {
 };

 __host__ __device__ inline uint64_t double_to_bits(double d) {
+#ifdef __CUDA_ARCH__
+  uint64_t bits = __double_as_longlong(d);
+#else
  uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
+#endif
  return bits;
 }

 __host__ __device__ inline double bits_to_double(uint64_t bits) {
+#ifdef __CUDA_ARCH__
+  double d = __longlong_as_double(bits);
+#else
  double d = *reinterpret_cast<double *>(&bits);
+#endif
  return d;
 }

@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
  const double A = ONE << 52;
  const double B = ONE << 104;
  const double C = ONE << 76;
+  // NOTE: __longlong_as_double(0x37f0000000000000ULL) is not usable here:
+  // that bit pattern encodes 2^-128, while D is 2^128 (0x47f0000000000000)
  const double D = 340282366920938500000000000000000000000.;

  const __uint128_t threshold = (ONE << 104);
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = A_bits | lower64;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - A;

    uint64_t B_bits = double_to_bits(B);
    uint64_t top64 = static_cast<uint64_t>(x >> 52);
    uint64_t bits_h = B_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
+#else
+    double l = l_temp - A;
    double h = h_temp - B;

    return (l + h);
+#endif

  } else {
    uint64_t C_bits = double_to_bits(C);
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = C_bits | lower64 | mask_part;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - C;

    uint64_t D_bits = double_to_bits(D);
    uint64_t top64 = static_cast<uint64_t>(x >> 76);
    uint64_t bits_h = D_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
+#else
+    double l = l_temp - C;
    double h = h_temp - D;

    return (l + h);
+#endif
  }
 }

@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {

 __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  auto x = f128::sub_estimate(a, f128::f128_floor(a));
+  // NOTE: __longlong_as_double(0x37f0000000000000ULL) is not usable here:
+  // that pattern is 2^-128; normalization is 2^128 (0x47f0000000000000)
  const double normalization = 340282366920938500000000000000000000000.;
 #ifdef __CUDA_ARCH__
  x.hi = __dmul_rn(x.hi, normalization);
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  x.lo *= normalization;
 #endif

-  // TODO has to be round
+  x = f128::add_estimate(x, f128(0.5, 0.0));
  x = f128::f128_floor(x);

  __uint128_t x0 = f64_to_u128(x.hi);
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -12,8 +12,9 @@
 using Index = unsigned;

 #define NEG_TWID(i)                                                            \
-  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
-         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
+  f128x2(                                                                      \
+      f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])),  \
+      f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))

 #define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
  f128x2_reg.re.hi = dt_re_hi[ind];                                            \
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }
      tid = tid + STRIDE;
    }
    __syncthreads();
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);
-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }
      w = NEG_TWID(tid / lane_mask + twiddle_shift);
      f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
      f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      // keep one of the register for next iteration and store another one in sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }

      tid = tid + STRIDE;
    }
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);

-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }

      tid = tid + STRIDE;
    }
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -12,7 +12,7 @@
 #include "integer/subtraction.cuh"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"

 // lwe_dimension + 1 threads
 // todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(

  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -134,13 +134,6 @@ __host__ void are_all_comparisons_block_true(
        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
          return x == chunk_length;
        };
-        generate_device_accumulator_with_cpu_prealloc<Torus>(
-            streams.stream(0), streams.gpu_index(0),
-            is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
-            is_max_value_lut->get_max_degree(1), glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
-            is_equal_to_num_blocks_lut_f, true,
-            are_all_block_true_buffer->preallocated_h_lut);

        Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
        for (int index = 0; index < num_chunks; index++) {
@@ -155,7 +148,10 @@ __host__ void are_all_comparisons_block_true(
                                 streams.stream(0), streams.gpu_index(0));
        auto active_streams =
            streams.active_gpu_subset(num_chunks, params.pbs_type);
-        is_max_value_lut->broadcast_lut(active_streams);
+
+        is_max_value_lut->generate_and_broadcast_lut(
+            active_streams, {1}, {is_equal_to_num_blocks_lut_f}, true, true,
+            {are_all_block_true_buffer->preallocated_h_lut});
      }
      lut = is_max_value_lut;
    }
@@ -222,7 +218,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -483,14 +479,10 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    y = x;
    f = sign_handler_f;
  }
-  generate_device_accumulator_with_cpu_prealloc<Torus>(
-      streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
-      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
-      polynomial_size, message_modulus, carry_modulus, f, true,
-      tree_buffer->preallocated_h_lut);

  auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-  last_lut->broadcast_lut(active_streams);
+  last_lut->generate_and_broadcast_lut(active_streams, {0}, {f}, true, true,
+                                       {tree_buffer->preallocated_h_lut});

  // Last leaf
  integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -10,7 +10,122 @@
 #include "integer/integer.cuh"
 #include "linearalgebra/multiplication.cuh"
 #include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
+
+/*
+ * =============================================================================
+ * GPU Compression/Decompression Algorithm: Overview
+ * =============================================================================
+ *
+ * The compression algorithm transforms standard LWE ciphertexts into a compact
+ * packed format. Decompression reverses this process.
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION INPUT (lwe_array_in)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                    lwe_array_in (GPU memory)                            |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------+---------------------------+-----------------+
+ *  |          LWE 0            |          LWE 1            |      ...        |
+ *  |      [mask, body]         |      [mask, body]         |                 |
+ *  +---------------------------+---------------------------+-----------------+
+ *  |<-- lwe_dimension + 1 -->|
+ *
+ *  Total LWEs: total_lwe_bodies_count (num_radix_blocks)
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION PROCESS
+ * -----------------------------------------------------------------------------
+ *
+ * 1. Message Shift (64-bit only):
+ *    Each LWE is multiplied by message_modulus to shift the message to MSB
+ *
+ * 2. Packing Keyswitch (LWE -> GLWE):
+ *    Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
+ *
+ *    +--------------------------------------------------------------+
+ *    |   lwe_per_glwe LWEs (input batch)                            |
+ *    |   LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1]                   |
+ *    +--------------------------------------------------------------+
+ *                              |
+ *                    Packing Keyswitch
+ *                              v
+ *    +--------------------------------------------------------------+
+ *    |            Single GLWE Ciphertext                            |
+ *    |   [A_0, A_1, ..., A_{k-1}, B]                                |
+ *    |   |<-- k * polynomial_size -->| |<-- polynomial_size -->|   |
+ *    +--------------------------------------------------------------+
+ *
+ *    Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
+ *                                             lwe_per_glwe)
+ *
+ * 3. Modulus Switch:
+ *    Reduce precision from 64-bit torus to storage_log_modulus bits
+ *
+ * 4. Bit Packing:
+ *    Pack multiple reduced-precision elements into dense bit representation
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                 tmp_glwe_array_out (intermediate buffer)                |
+ *  +-------------------------------------------------------------------------+
+ *  +----------------------------+----------------------------+---------------+
+ *  |         GLWE 0             |         GLWE 1             |    ...        |
+ *  |[A_0..A_{k-1}, B_0..B_{N-1}]|[A_0..A_{k-1}, B_0..B_{N-1}]|               |
+ *  +----------------------------+----------------------------+---------------+
+ *       |<-- glwe_accumulator_size = (k+1)*N -->|
+ *
+ *  Total size needed: num_glwes * glwe_accumulator_size elements
+ *  Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
+ *
+ * -----------------------------------------------------------------------------
+ * PACKED OUTPUT (glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |              Packed GLWE Ciphertext List (bit-packed)                   |
+ *  +-------------------------------------------------------------------------+
+ *  +-------------------------------------------------------------------------+
+ *  |  Elements packed with storage_log_modulus bits per original element    |
+ *  |  Total packed size: ceil(in_len * storage_log_modulus / 64) elements   |
+ *  +-------------------------------------------------------------------------+
+ *
+ * =============================================================================
+ * DECOMPRESSION (Extract) Algorithm
+ * =============================================================================
+ *
+ * Decompression receives an array of LWE indexes. For each index, it identifies
+ * the corresponding GLWE, extracts that GLWE from the packed representation,
+ * and then sample-extracts the requested LWE from the GLWE.
+ *
+ * -----------------------------------------------------------------------------
+ * EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |               Extracted GLWE Ciphertext                                 |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------------------+-----------------+---------------+
+ *  |    Mask (A polynomials)               |   Body (B)      |    Tail       |
+ *  |    [A_0, ..., A_{k-1}]                |   (body_count)  |   (zeroed)    |
+ *  |    k * polynomial_size elements       |   elements      |   elements    |
+ *  +---------------------------------------+-----------------+---------------+
+ *  |<------------------- initial_out_len ------------------->|               |
+ *  |<------------------------ glwe_ciphertext_size ------------------------->|
+ *
+ *  For the last GLWE, body_count may be less than polynomial_size (partial).
+ *  The tail region must be zeroed to ensure defined behavior.
+ *
+ *  tail_size = glwe_ciphertext_size - initial_out_len
+ *  tail_offset = initial_out_len  (NOT 0!)
+ *
+ * =============================================================================
+ */

 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,

  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto nbits = sizeof(Torus) * 8;
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto out_len = CEIL_DIV(number_bits_to_pack, nbits);

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -91,6 +206,10 @@ host_integer_compress(CudaStreams streams,

  auto compression_params = mem_ptr->compression_params;

+  PANIC_IF_FALSE(glwe_array_out->lwe_per_glwe == mem_ptr->lwe_per_glwe,
+                 "lwe_per_glwe mismatch between scratch allocation and host "
+                 "function input");
+
  // Shift
  auto lwe_pksk_input = (Torus *)lwe_array_in->ptr;

@@ -108,6 +227,8 @@ host_integer_compress(CudaStreams streams,
  uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
                        glwe_array_out->lwe_per_glwe - 1) /
                       glwe_array_out->lwe_per_glwe;
+  PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
+                 "Invalid number of GLWEs");

  // Keyswitch LWEs to GLWE
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -136,9 +257,9 @@ host_integer_compress(CudaStreams streams,
  }

  // Modulus switch
-  int size = num_glwes * compression_params.glwe_dimension *
-                 compression_params.polynomial_size +
-             glwe_array_out->total_lwe_bodies_count;
+  uint32_t size = num_glwes * compression_params.glwe_dimension *
+                      compression_params.polynomial_size +
+                  glwe_array_out->total_lwe_bodies_count;

  host_modulus_switch_inplace<Torus>(streams.stream(0), streams.gpu_index(0),
                                     tmp_glwe_array_out, size,
@@ -200,8 +321,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;

-  uint32_t num_glwes =
-      (total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
+  uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);

  // Compressed length of the compressed GLWE we want to extract
  uint32_t body_count = 0;
@@ -218,19 +338,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;

-  // Calculates how many bits this particular GLWE shall use
-  auto number_bits_to_unpack = initial_out_len * log_modulus;
  auto nbits = sizeof(Torus) * 8;

-  // Calculates how many bits a full-packed GLWE shall use
-  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
-  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Calculate how many bits a full-packed GLWE uses, to determine
+  // the stride between consecutive packed GLWEs in the input buffer
+  auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = CEIL_DIV(number_bits_to_unpack, nbits);
  // Uses that length to set the input pointer
  auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;

  // Ensure the tail of the GLWE is zeroed
+  // The extract kernel writes initial_out_len elements starting at offset 0.
+  // We must zero the tail region (from initial_out_len to
+  // glwe_ciphertext_size)
  if (initial_out_len < glwe_ciphertext_size) {
-    cuda_memset_async(glwe_array_out, 0,
+    cuda_memset_async(glwe_array_out + initial_out_len, 0,
                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
                      stream, gpu_index);
  }
@@ -263,7 +385,12 @@ host_integer_decompress(CudaStreams streams,
                           streams.stream(0), streams.gpu_index(0));

  auto compression_params = h_mem_ptr->compression_params;
-  auto lwe_per_glwe = compression_params.polynomial_size;
+
+  // Use the lwe_per_glwe value from the packed GLWE metadata.
+  // This value was set during compression and may differ from polynomial_size.
+  // For example, noise squashing compression uses lwe_per_glwe=128 with
+  // polynomial_size=1024.
+  auto lwe_per_glwe = d_packed_glwe_in->lwe_per_glwe;

  // the first element is the number of LWEs that lies in the related GLWE
  std::vector<std::pair<int, Torus *>> glwe_vec;
@@ -368,7 +495,7 @@ host_integer_decompress(CudaStreams streams,
      /// gather data to GPU 0 we can copy back to the original indexing
      multi_gpu_scatter_lwe_async<Torus>(
          active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
-          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
+          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
          lut->active_streams.count(), num_blocks_to_decompress,
          compression_params.small_lwe_dimension + 1);

@@ -388,7 +515,7 @@ host_integer_decompress(CudaStreams streams,
      multi_gpu_gather_lwe_async<Torus>(
          active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
          lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-          lut->lwe_aligned_vec, num_blocks_to_decompress,
+          lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_decompress,
          encryption_params.big_lwe_dimension + 1);

      /// Synchronize all GPUs
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -14,7 +14,6 @@
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <algorithm>
 #include <functional>

@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
      }
    }

-    if ((blockIdx.x / group_size + 1) <
-        (blocks_count + group_size - 1) / group_size) {
+    if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
      size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
      size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
      for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
      num_radix_blocks > src1->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks should have more "
          "blocks than the number used in sum in groups")
-  auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
+  auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
  if (src2->num_radix_blocks < num_groups)
    PANIC("Cuda error: second input in sum in groups should have at least "
          "num_groups blocks")
@@ -570,8 +568,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -594,7 +592,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)lwe_array_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
+        big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
        active_streams);
@@ -674,8 +673,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -791,8 +790,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -815,7 +814,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)(lwe_array_out->ptr), lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
+        big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
        active_streams);
@@ -960,8 +960,9 @@ uint64_t generate_many_lookup_table(
 template <typename Torus>
 void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
                                       uint32_t polynomial_size,
-                                       std::function<Torus(uint32_t)> f) {
+                                       std::function<Torus(Torus)> f) {

+  // accumulator number of elements is (glwe_dimension + 1) * polynomial_size
  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];
@@ -973,9 +974,9 @@ void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,

 template <typename Torus>
 void generate_device_accumulator_no_encoding(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t &degree,
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
+    uint32_t polynomial_size, std::function<Torus(Torus)> f,
    bool gpu_memory_allocated) {

  Torus *h_lut =
@@ -984,7 +985,7 @@ void generate_device_accumulator_no_encoding(
  generate_lookup_table_no_encoding<Torus>(h_lut, glwe_dimension,
                                           polynomial_size, f);

-  degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
+  *degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;

  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -1067,85 +1068,6 @@ void generate_device_accumulator_bivariate(
  POP_RANGE()
 }

-template <typename Torus> struct int_lut_cache {
-  int_lut_cache() {}
-
-  Torus *get_cached_univariate_lut(std::function<Torus(Torus)> &f, uint64_t *degree,
-                        uint64_t *max_degree, uint32_t glwe_dimension,
-                        uint32_t polynomial_size,
-                        uint32_t input_message_modulus,
-                        uint32_t input_carry_modulus,
-                        uint32_t output_message_modulus,
-                        uint32_t output_carry_modulus) {
-    /*__int128_t f_hash = 0;
-    uint32_t bits_per_lut_val = 5;
-    uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
-    for (uint32_t i = 0; i < input_modulus_sup; ++i) {
-      Torus f_eval = f(i);
-      GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
-                 "LUT value expected bitwidth overflow");
-      f_hash |= f_eval;
-      f_hash <<= bits_per_lut_val;
-    }
-
-    std::lock_guard cache_lock(_mutex);
-    if (_lut_cache.find(f_hash) != _lut_cache.end()) {
-      lut_ptr &ptr = _lut_cache[f_hash];
-      GPU_ASSERT(ptr.output_message_modulus == output_message_modulus,
-                 "Error modulus");
-      GPU_ASSERT(ptr.input_message_modulus == input_message_modulus,
-                 "Error modulus");
-      GPU_ASSERT(ptr.glwe_dimension == glwe_dimension, "Error modulus");
-      *max_degree = ptr.max_degree;
-      *degree = ptr.degree;
-      return ptr.ptr;
-    }*/
-
-    // host lut
-    Torus *h_lut =
-        (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-    *max_degree = input_message_modulus * input_carry_modulus - 1;
-    *degree = generate_lookup_table_with_encoding<Torus>(
-        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
-        input_carry_modulus, output_message_modulus, output_carry_modulus, f);
-
-    /*lut_ptr new_ptr = {h_lut,
-                       glwe_dimension,
-                       input_message_modulus,
-                       input_carry_modulus,
-                       output_message_modulus,
-                       output_carry_modulus,
-                       *max_degree,
-                       *degree};*/
-    //_lut_cache[f_hash] = new_ptr;
-    return h_lut;
-  }
-
-  ~int_lut_cache() {
-    std::lock_guard cache_lock(_mutex);
-    for (auto v : _lut_cache) {
-      free(v.second.ptr);
-    }
-    _lut_cache.clear();
-  }
-
-private:
-  struct lut_ptr {
-    Torus *ptr;
-    uint32_t glwe_dimension;
-    uint32_t input_message_modulus;
-    uint32_t input_carry_modulus;
-    uint32_t output_message_modulus;
-    uint32_t output_carry_modulus;
-    uint64_t max_degree;
-    uint64_t degree;
-  };
-  std::map<__int128_t, lut_ptr> _lut_cache;
-  std::mutex _mutex;
-};
-static int_lut_cache<uint64_t> g_LutCache64;
-
 /*
 *  generate bivariate accumulator with factor scaling for device pointer
 *    v_stream - cuda stream
@@ -1177,8 +1099,8 @@ void generate_device_accumulator_bivariate_with_factor(
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
      gpu_memory_allocated);

-//  cuda_synchronize_stream(stream, gpu_index);
-//  free(h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 /*
 *  generate bivariate accumulator for device pointer
@@ -1224,36 +1146,23 @@ void generate_device_accumulator_with_encoding(
    uint32_t output_message_modulus, uint32_t output_carry_modulus,
    std::function<Torus(Torus)> f, bool gpu_memory_allocated) {

-  static constexpr auto is_u64 = std::is_same_v<Torus, uint64_t>;
-  Torus *h_lut = nullptr;
  // host lut
-  if constexpr (is_u64) {
-    h_lut = g_LutCache64.get_cached_univariate_lut(
-        f, degree, max_degree, glwe_dimension, polynomial_size,
-        input_message_modulus, input_carry_modulus, output_message_modulus,
-        output_carry_modulus);
-  } else {
-    h_lut =
-        (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+  Torus *h_lut =
+      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+
+  *max_degree = input_message_modulus * input_carry_modulus - 1;
+  // fill accumulator
+  *degree = generate_lookup_table_with_encoding<Torus>(
+      h_lut, glwe_dimension, polynomial_size, input_message_modulus,
+      input_carry_modulus, output_message_modulus, output_carry_modulus, f);

-    *max_degree = input_message_modulus * input_carry_modulus - 1;
-    // fill accumulator
-    *degree = generate_lookup_table_with_encoding<Torus>(
-        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
-        input_carry_modulus, output_message_modulus, output_carry_modulus, f);
-  }
-/*
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);
-*/
-  if (!std::is_same_v<Torus, uint64_t>) {
-    cuda_synchronize_stream(stream, gpu_index);
-    free(h_lut);
-  }
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
-
 template <typename Torus>
 void generate_device_accumulator_with_encoding_with_cpu_prealloc(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
@@ -1356,8 +1265,8 @@ void generate_many_lut_device_accumulator(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);

-  //cuda_synchronize_stream(stream, gpu_index);
-  //free(h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
  POP_RANGE()
 }

@@ -1828,12 +1737,9 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
      signs_array_in, 0, num_sign_blocks);
  if (num_sign_blocks > 2) {
    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator_with_cpu_prealloc<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        reduce_two_orderings_function, true, diff_buffer->preallocated_h_lut1);
-    lut->broadcast_lut(lut->active_streams);
+    lut->generate_and_broadcast_lut(lut->active_streams, {0},
+                                    {reduce_two_orderings_function}, true, true,
+                                    {diff_buffer->preallocated_h_lut1});

    while (num_sign_blocks > 2) {
      pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
@@ -1859,12 +1765,10 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    };

    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator_with_cpu_prealloc<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
-        diff_buffer->preallocated_h_lut2);
-    lut->broadcast_lut(lut->active_streams);
+
+    lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
+                                    true, true,
+                                    {diff_buffer->preallocated_h_lut2});

    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
                       signs_a, num_sign_blocks, message_modulus);
@@ -1879,12 +1783,9 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    };

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
-    generate_device_accumulator_with_cpu_prealloc<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
-        diff_buffer->preallocated_h_lut2);
-    lut->broadcast_lut(lut->active_streams);
+    lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
+                                    true, true,
+                                    {diff_buffer->preallocated_h_lut2});

    integer_radix_apply_univariate_lookup_table<Torus>(
        streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
@@ -2434,7 +2335,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,

  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
-  auto active_streams = streams.active_gpu_subset(
+  auto active_streams = streams.active_gpu_subset_u128(
      lwe_array_out->num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
@@ -2465,8 +2366,9 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
    multi_gpu_scatter_lwe_async<InputTorus>(
        active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_scatter_vec, lut->active_streams.count(),
-        lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
+        lut->lwe_aligned_scatter_vec, lut->event_pool,
+        lut->active_streams.count(), lwe_array_out->num_radix_blocks,
+        lut->input_big_lwe_dimension + 1);

    execute_keyswitch_async<InputTorus>(
        active_streams, lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -2489,7 +2391,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
    multi_gpu_gather_lwe_async<__uint128_t>(
        active_streams, (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
        nullptr, lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
-        lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
+        lut->event_pool, lwe_array_out->num_radix_blocks,
+        big_lwe_dimension + 1);

    /// Synchronize all GPUs
    streams.synchronize();
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -362,7 +362,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
  radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
                                num_radix_in_vec, chunk_size, needs_processing);
  int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
-  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
+  int part_count = CEIL_DIV(big_lwe_size, number_of_threads);
  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

  mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -10,7 +10,7 @@
 #include "device.h"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <iostream>
 #include <sstream>
 #include <string>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -54,7 +54,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
-        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
+        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
        active_streams.count(), num_blocks_to_process,
        mem_ptr->params.small_lwe_dimension + 1);
    POP_RANGE()
@@ -72,7 +72,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_blocks_to_process,
+        lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_process,
        mem_ptr->params.big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
--- a/Show More
+++ b/Show More