chore(gpu): 32-bit zk

ZK_CUDA_LIMB_BITS=32 make test_integer_zk_experimental_gpu
chore(gpu): add new benchmark target for zk-cuda-backend accelerated functions
2026-04-28 03:01:21 -04:00 · 2026-03-03 14:52:35 -03:00 · 2026-03-02 16:03:50 -03:00 · 2026-03-02 16:03:49 -03:00 · 2026-03-02 16:03:48 -03:00 · 2026-03-02 16:02:44 -03:00
513 changed files with 60445 additions and 22934 deletions
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -68,6 +68,12 @@ runs:
        echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
        sha256sum -c checksum
        sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
+
+        # Disable unattended-upgrades to avoid lock issues
+        sudo systemctl disable --now unattended-upgrades
+
+        sudo apt-get clean
+        sudo rm -rf /var/lib/apt/lists/*
        sudo apt update
        sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"

--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -0,0 +1,15 @@
+runners:  # Runner profiles; workflows pick one via the `runner=<name>` part of their `runs-on` label
+  cpu-big:  # Requested by e.g. the fast, integer, signed-integer and cpu-tests jobs
+    family: m6i.32xlarge
+    image: cpu-tests-eu-west-3  # Same name as the entry declared under `images` below
+    volume: 200gb
+    spot: false  # NOTE(review): presumably requests on-demand (non-spot) capacity — confirm
+  cpu-small:  # Requested by e.g. the backward-compat and wasm-tests jobs
+    family: m6i.4xlarge
+    image: cpu-tests-eu-west-3  # Same name as the entry declared under `images` below
+    volume: 200gb
+    spot: false  # NOTE(review): presumably requests on-demand (non-spot) capacity — confirm
+
+images:  # Named images that the `image` field of a runner profile refers to
+  cpu-tests-eu-west-3:
+    ami: "ami-0a786ffdb1411fac4"  # Ubuntu 24.04
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -14,9 +14,7 @@ env:
  SLACKIFY_MARKDOWN: true
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -32,41 +30,17 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  setup-instance:
-    name: aws_tfhe_backward_compat_tests/setup-instance
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
  backward-compat-tests:
    name: aws_tfhe_backward_compat_tests/backward-compat-tests (bpr)
-    needs: [ setup-instance ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name != 'push'
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
    concurrency:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'true' # Needed to pull lfs data
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -109,7 +83,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -131,27 +105,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: aws_tfhe_backward_compat_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, backward-compat-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -15,9 +15,7 @@ env:
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -63,7 +61,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -134,44 +132,18 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

-  setup-instance:
-    name: aws_tfhe_fast_tests/setup-instance
-    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
-    needs: should-run
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
  fast-tests:
    name: Fast CPU tests
-    needs: [ should-run, setup-instance ]
+    needs: should-run
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -219,7 +191,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            ~/.nvm
@@ -232,7 +204,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -289,27 +261,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: aws_tfhe_fast_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, fast-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -17,9 +17,7 @@ env:
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -50,7 +48,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -71,48 +69,22 @@ jobs:
              - tfhe/src/integer/**
              - .github/workflows/aws_tfhe_integer_tests.yml

-  setup-instance:
-    name: aws_tfhe_integer_tests/setup-instance
+  unsigned-integer-tests:
+    name: aws_tfhe_integer_tests/unsigned-integer-tests
    needs: should-run
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
      github.event_name == 'workflow_dispatch'
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
-  unsigned-integer-tests:
-    name: aws_tfhe_integer_tests/unsigned-integer-tests
-    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -158,27 +130,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: aws_tfhe_integer_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [setup-instance, unsigned-integer-tests]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -13,8 +13,7 @@ env:
  SLACKIFY_MARKDOWN: true
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}  # Still read by the slab start/stop steps; true only for zama-ai organization members

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -35,7 +34,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +59,7 @@ jobs:
    timeout-minutes: 1440
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -100,7 +99,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -17,9 +17,7 @@ env:
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -51,7 +49,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -72,47 +70,21 @@ jobs:
              - tfhe/src/integer/**
              - .github/workflows/aws_tfhe_signed_integer_tests.yml

-  setup-instance:
-    name: aws_tfhe_signed_integer_tests/setup-instance
+  signed-integer-tests:
+    name: aws_tfhe_signed_integer_tests/signed-integer-tests
    needs: should-run
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
      github.event_name == 'workflow_dispatch'
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
-  signed-integer-tests:
-    name: aws_tfhe_signed_integer_tests/signed-integer-tests
-    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -162,27 +134,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: aws_tfhe_signed_integer_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [setup-instance, signed-integer-tests]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -14,9 +14,7 @@ env:
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -72,7 +70,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -143,46 +141,18 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

-  setup-instance:
-    name: aws_tfhe_tests/setup-instance
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
-    needs: should-run
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
  cpu-tests:
    name: aws_tfhe_tests/cpu-tests
+    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
-    needs: [ should-run, setup-instance ]
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
    concurrency:
      group: ${{ github.workflow_ref }}_${{github.event_name}}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -269,27 +239,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: aws_tfhe_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, cpu-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -13,9 +13,7 @@ env:
  SLACKIFY_MARKDOWN: true
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -29,42 +27,56 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  setup-instance:
-    name: aws_tfhe_wasm_tests/setup-instance
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+  should-run:
+    name: aws_tfhe_wasm_tests/should-run
+    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
+          steps.changed-files.outputs.wasm_any_changed }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            wasm:
+                - Cargo.toml
+                - tfhe/Cargo.toml
+                - tfhe-csprng/**
+                - tfhe-fft/**
+                - tfhe-zk-pok/**
+                - tfhe/src/core_crypto/**
+                - tfhe/src/shortint/**
+                - tfhe/src/integer/**
+                - tfhe/src/high_level_api/**
+                - tfhe/src/js_on_wasm_api/**
+                - tfhe/js_on_wasm_tests/**
+                - tfhe/web_wasm_parallel_tests/**
+                - utils/tfhe-versionable/**
+                - .github/workflows/aws_tfhe_wasm_tests.yml

  wasm-tests:
    name: aws_tfhe_wasm_tests/wasm-tests
-    needs: setup-instance
+    needs: should-run
+    if: github.event_name == 'workflow_dispatch' ||
+      (contains(github.event.label.name, 'approved') && needs.should-run.outputs.wasm_test == 'true')
    concurrency:
      group: ${{ github.workflow_ref }}_${{github.event_name}}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            ~/.nvm
@@ -93,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -137,27 +149,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: aws_tfhe_wasm_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, wasm-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -14,12 +14,15 @@ on:
          - signed_integer
          - integer_compression
          - integer_zk
+          - msm_zk
          - shortint
          - shortint_oprf
-          - hlapi
+          - hlapi_unsigned
+          - hlapi_signed
          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
+          - hlapi_kvstore
          - tfhe_zk_pok
          - boolean
          - pbs
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -24,6 +24,7 @@ permissions: {}
 jobs:
  prepare-inputs:
    name: benchmark_cpu_weekly/prepare-inputs
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    outputs:
      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -72,8 +73,7 @@ jobs:

  run-benchmarks-integer:
    name: benchmark_cpu_weekly/run-benchmarks-integer
-    if: github.repository == 'zama-ai/tfhe-rs' 
-      && (needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -92,8 +92,7 @@ jobs:

  run-benchmarks-integer-zk-pke:
    name: benchmark_cpu_weekly/run-benchmarks-integer-zk-pke
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -111,8 +110,7 @@ jobs:

  run-benchmarks-hlapi-erc20:
    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -130,8 +128,7 @@ jobs:

  run-benchmarks-hlapi-dex:
    name: benchmark_cpu_weekly/run-benchmarks-hlapi-dex
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -149,8 +146,7 @@ jobs:

  run-benchmarks-core-crypto:
    name: benchmark_cpu_weekly/run-benchmarks-core-crypto
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -167,8 +163,7 @@ jobs:

  run-benchmarks-shortint:
    name: benchmark_cpu_weekly/run-benchmarks-shortint
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && (needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -186,8 +181,7 @@ jobs:

  run-benchmarks-boolean:
    name: benchmark_cpu_weekly/run-benchmarks-boolean
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -206,8 +200,7 @@ jobs:

  run-benchmarks-tfhe-zk-pok:
    name: benchmark_cpu_weekly/run-benchmarks-tfhe-zk-pok
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -25,10 +25,6 @@ on:
        description: "Generate SVG tables"
        type: boolean
        default: true
-      open-pr:
-        description: "Open a PR with the benchmark results"
-        type: boolean
-        default: false

 permissions: {}

@@ -40,7 +36,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer
+      command: integer,hlapi_erc20
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -54,6 +50,40 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

+  run-benchmarks-cpu-zk-server:
+    name: benchmark_documentation/run-benchmarks-cpu-zk-server
+    uses: ./.github/workflows/benchmark_cpu_common.yml
+    if: inputs.run-cpu-benchmarks
+    with:
+      command: integer_zk
+      op_flavor: default
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-cpu-zk-client:
+    name: benchmark_documentation/run-benchmarks-cpu-zk-client
+    uses: ./.github/workflows/benchmark_wasm_client_common.yml
+    if: inputs.run-cpu-benchmarks
+    with:
+      browser: chrome
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
  run-benchmarks-gpu-integer:
    name: benchmark_documentation/run-benchmarks-gpu-integer
    uses: ./.github/workflows/benchmark_gpu_common.yml
@@ -61,7 +91,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit
+      command: integer_multi_bit,hlapi_erc20
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -80,7 +110,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer
+      command: integer,hlapi_erc20
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -142,6 +172,7 @@ jobs:
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
+      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
    ]
    uses: ./.github/workflows/generate_svgs.yml
@@ -166,54 +197,3 @@ jobs:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  open-pr:
-    name: benchmark-documentation/open-pr
-    needs: [ generate-svgs-with-benchmarks-run, generate-svgs-without-benchmarks-run ]
-    if: ${{ always() && inputs.open-pr &&
-      (needs.generate-svgs-with-benchmarks-run.result == 'success' || needs.generate-svgs-without-benchmarks-run.result == 'success') }}
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write # Needed to create a commit
-      pull-requests: write # Needed to open a pull-request
-    env:
-      PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
-        with:
-          persist-credentials: 'false'
-
-      - name: Download SVG tables
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
-        with:
-          path: svg_tables
-          merge-multiple: 'true'
-
-      # Perform best effort to copy SVG tables. If the copy fails or files don't exist, the PR will still be created.
-      - name: Copy SVG tables to documentation location
-        run: |
-          cp -f svg_tables/*integer-benchmark*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-          cp -f svg_tables/*pbs-benchmark-tuniform*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-          cp -f svg_tables/cpu-gpu-hpu-integer-benchmark-fheuint64-tuniform-2m128-ciphertext.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-
-      - name: Get current date
-        id: get-date
-        run: |
-          echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"
-
-      - name: Create pull-request
-        uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
-        with:
-          sign-commits: true # Commit will be signed by github-actions bot
-          add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
-          branch: gh-bot/docs/update-svg-tables-${{ steps.get-date.outputs.date }}
-          commit-message: |
-            chore(docs): update benchmark results for all backends
-
-            Automated documentation update from tfhe-rs CI pipeline.
-          title: |
-            [CI] chore(docs): update benchmark results for all backends
-          body: |
-            Documentation update triggered by GitHub workflow.
-          labels: documentation
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,6 +31,8 @@ on:
          - pbs128
          - ks
          - ks_pbs
+          - tfhe_zk_pok
+          - msm_zk
          - integer_zk
          - integer_aes
          - integer_aes256
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 1440 # 24 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -123,7 +123,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -50,6 +50,8 @@ env:
 jobs:
  parse-inputs:
    name: benchmark_gpu_coprocessor/parse-inputs
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
@@ -92,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +132,7 @@ jobs:
          git lfs install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          path: tfhe-rs
          persist-credentials: false
@@ -141,7 +143,7 @@ jobs:
          ls

      - name: Checkout fhevm
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          repository: zama-ai/fhevm
          persist-credentials: 'false'
@@ -192,10 +194,10 @@ jobs:
          cargo install sqlx-cli

      - name: Install foundry
-        uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
+        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
+        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
        with:
          path: |
            ~/.cargo/registry
@@ -205,14 +207,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
+        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
+        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -299,7 +301,7 @@ jobs:
          path: fhevm/$${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +326,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -1,289 +1,28 @@
-# Run CUDA benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
+# Run CUDA benchmarks on Hyperstack VM and return parsed results to Slab CI bot.
 name: benchmark_gpu_weekly

+run-name: GPU weekly benchmarks
+
 on:
  schedule:
-    # Weekly schedules are separated in several groups to avoid spawning too many the machines at once thus risking resource shortages.
-    # Group 1
-    # -------
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
    - cron: '0 1 * * 6'
-    # Group 2
-    # -------
-    # Weekly benchmarks will be triggered each Sunday at 1a.m.
-    - cron: '0 1 * * 0'
-    # Group 3
-    # -------
-    # Weekly benchmarks will be triggered each Sunday at 9p.m.
-    - cron: '0 9 * * 0'
-

 permissions: {}

 # zizmor: ignore[concurrency-limits] only GitHub can trigger this workflow

 jobs:
-  prepare-inputs:
-    name: benchmark_cpu_weekly/prepare-inputs
-    runs-on: ubuntu-latest
-    outputs:
-      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
-      is_weekly_bench_group_2: ${{ steps.check_bench_group_2.outputs.is_weekly_bench_group_2 }}
-      is_weekly_bench_group_3: ${{ steps.check_bench_group_3.outputs.is_weekly_bench_group_3 }}
-    steps:
-      - name: Check is weekly bench group 1
-        id: check_bench_group_1
-        run: | # zizmor: ignore[template-injection] this env variable is safe
-          echo "is_weekly_bench_group_1=${{ github.event.schedule == '0 1 * * 6' }}" >> "${GITHUB_OUTPUT}"
-
-      - name: Check is weekly bench group 2
-        id: check_bench_group_2
-        run: | # zizmor: ignore[template-injection] this env variable is safe
-          echo "is_weekly_bench_group_2=${{ github.event.schedule == '0 1 * * 0' }}" >> "${GITHUB_OUTPUT}"
-
-      - name: Check is weekly bench group 3
-        id: check_bench_group_3
-        run: | # zizmor: ignore[template-injection] this env variable is safe
-          echo "is_weekly_bench_group_3=${{ github.event.schedule == '0 9 * * 0' }}" >> "${GITHUB_OUTPUT}"
-
-
-  run-benchmarks-8-h100-sxm5-integer:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
-    needs: prepare-inputs
+  run-benchmarks-8-h100-sxm5-summary:
+    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-summary
+    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit
-      op_flavor: default
-      bench_type: both
-      precisions_set: fast
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-sxm5-integer-compression:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100-SXM5x8
-      command: integer_compression
-      op_flavor: default
-      bench_type: both
-      precisions_set: fast
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-sxm5-integer-zk-aes:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk-aes
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100-SXM5x8
-      command: integer_zk,integer_aes,integer_aes256
-      op_flavor: default
-      bench_type: both
-      precisions_set: fast
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-sxm5-noise-squash:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100-SXM5x8
-      command: hlapi_noise_squash
-      op_flavor: default
-      bench_type: both
-      precisions_set: fast
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-1-h100-core-crypto:
-    name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: single-h100
-      hardware_name: n3-H100x1
-      command: pbs,pbs128,ks,ks_pbs
-      bench_type: latency
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  # -----------------------------------------------------
-  # ERC20 benchmarks
-  # -----------------------------------------------------
-
-  run-benchmarks-1-h100-erc20:
-    name: benchmark_gpu_weekly/run-benchmarks-1-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: single-h100
-      hardware_name: n3-H100x1
-      command: hlapi_erc20
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-2-h100-erc20:
-    name: benchmark_gpu_weekly/run-benchmarks-2-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: 2-h100
-      hardware_name: n3-H100x2
-      command: hlapi_erc20
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-erc20:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100
-      hardware_name: n3-H100-SXM5x8
-      command: hlapi_erc20
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  # -----------------------------------------------------
-  # DEX benchmarks
-  # -----------------------------------------------------
-
-  run-benchmarks-1-h100-dex:
-    name: benchmark_gpu_weekly/run-benchmarks-1-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: single-h100
-      hardware_name: n3-H100x1
-      command: hlapi_dex
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-2-h100-dex:
-    name: benchmark_gpu_weekly/run-benchmarks-2-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: 2-h100
-      hardware_name: n3-H100x2
-      command: hlapi_dex
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-dex:
-    name: benchmark_gpu_weekly/run-benchmarks-8-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
-    needs: prepare-inputs
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100
-      hardware_name: n3-H100-SXM5x8
-      command: hlapi_dex
+      command: summary
      bench_type: both
+      params_type: classical + multi_bit
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -12,7 +12,8 @@ on:
        default: integer
        options:
          - integer
-          - hlapi
+          - hlapi_unsigned
+          - hlapi_signed
          - hlapi_erc20
      op_flavor:
        description: "Operations set to run"
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -126,7 +126,7 @@ jobs:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -50,7 +50,7 @@ jobs:
      pull-requests: write # Needed to write a comment in a pull-request
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
        command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0  # Needed to get commit hash
          persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -305,13 +305,13 @@ jobs:
      REF_NAME: ${{ github.head_ref || github.ref_name }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Install recent Python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.12'
          pip-install: -r ci/data_extractor/requirements.txt -r ci/perf_regression/requirements.txt
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_summary.yml
+++ b/.github/workflows/benchmark_summary.yml
@@ -0,0 +1,136 @@
+# Run all benchmarks displayed in the internal documentation.
+name: benchmark_summary
+
+run-name: Benchmark Summary
+
+on:
+  workflow_dispatch:
+    inputs:
+      run-cpu-benchmarks:
+        description: "Run CPU benchmarks"
+        type: boolean
+        default: true
+      run-gpu-benchmarks:
+        description: "Run GPU benchmarks"
+        type: boolean
+        default: true
+      gpu-profile:
+        description: "GPU Instance type"
+        required: true
+        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "8-l40 (n3-L40x8)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: both
+        options:
+          - latency
+          - throughput
+          - both
+      run-hpu-benchmarks:
+        description: "Run HPU benchmarks"
+        type: boolean
+        default: true
+
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  parse-gpu-inputs:
+    name: benchmark_summary/parse-gpu-inputs
+    if: inputs.run-gpu-benchmarks
+    runs-on: ubuntu-latest
+    outputs:
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
+    steps:
+      - name: Parse profile
+        id: parse_profile
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
+
+      - name: Parse hardware name
+        id: parse_hardware_name
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
+
+  run-benchmarks-cpu:
+    name: benchmark_summary/run-benchmarks-cpu  # <workflow name>/<job id>
+    uses: ./.github/workflows/benchmark_cpu_common.yml
+    if: inputs.run-cpu-benchmarks
+    with:
+      command: summary
+      bench_type: ${{ inputs.bench_type }}
+      params_type: classical + multi_bit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-gpu:
+    name: benchmark_summary/run-benchmarks-gpu  # <workflow name>/<job id>
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-benchmarks
+    needs: parse-gpu-inputs
+    with:
+      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
+      command: summary
+      bench_type: ${{ inputs.bench_type }}
+      params_type: classical + multi_bit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+# TODO add make recipe for HPU benchmarks
+#  run-benchmarks-hpu:
+#    name: benchmark_summary/run-benchmarks-hpu
+#    uses: ./.github/workflows/benchmark_hpu_common.yml
+#    if: inputs.run-hpu-benchmarks
+#    with:
+#      command: summary
+#      bench_type: ${{ inputs.bench_type }}
+#      v80_pcie_dev: 24
+#      v80_serial_number: XFL12NWY3ZKG
+#    secrets:
+#      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+#      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+#      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+#      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+#      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+#      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+#      SLAB_URL: ${{ secrets.SLAB_URL }}
+#      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+#      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -31,13 +31,16 @@ permissions: {}
 jobs:
  setup-instance:
    name: benchmark_tfhe_fft/setup-instance
+    if:
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -102,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -31,13 +31,16 @@ permissions: {}
 jobs:
  setup-instance:
    name: benchmark_tfhe_ntt/setup-instance
+    if:
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -102,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -31,15 +31,14 @@ jobs:
    name: benchmark_wasm_client/should-run
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
      pull-requests: read  # Needed to check for file change
    outputs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -59,171 +58,19 @@ jobs:
              - tfhe/web_wasm_parallel_tests/**
              - .github/workflows/wasm_client_benchmark.yml

-  setup-instance:
-    name: benchmark_wasm_client/setup-instance
+  run-benchmarks-cpu-zk-client:
+    name: benchmark_wasm_client/run-benchmarks-cpu-zk-client
+    uses: ./.github/workflows/benchmark_wasm_client_common.yml
+    needs: should-run  # job outputs are always strings, so wasm_bench is compared to 'true' below
     if: github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench == 'true')
-    needs: should-run
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-  wasm-client-benchmarks:
-    name: benchmark_wasm_client/wasm-client-benchmarks
-    needs: setup-instance
-    if: needs.setup-instance.result != 'skipped'
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      max-parallel: 1
-      matrix:
-        browser: [ chrome, firefox ]
-    steps:
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=${COMMIT_DATE}";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-        env:
-          SHA: ${{ github.sha }}
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: nightly
-
-      - name: Get Node version
-        run: |
-          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
-
-      - name: Node cache restoration
-        id: node-cache
-        uses: actions/cache/restore@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install Node
-        if: steps.node-cache.outputs.cache-hit != 'true'
-        run: |
-          make install_node
-
-      - name: Node cache save
-        uses: actions/cache/save@8b402f58fbc84540c8b491a91e594a4576fec3d7 #v5.0.2
-        if: steps.node-cache.outputs.cache-hit != 'true'
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install web resources
-        run: |
-          make install_"${BROWSER}"_browser
-          make install_"${BROWSER}"_web_driver
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Run benchmarks
-        run: |
-          make bench_web_js_api_parallel_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Run benchmarks (unsafe coop)
-        run: |
-          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Parse results
-        run: |
-          make parse_wasm_benchmarks
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
-          --database tfhe_rs \
-          --hardware "m6i.4xlarge" \
-          --project-version "${COMMIT_HASH}" \
-          --branch "${REF_NAME}" \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${BENCH_DATE}" \
-          --key-gen
-          rm tfhe-benchmark/wasm_pk_gen.csv
-        env:
-          REF_NAME: ${{ github.ref_name }}
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
-        with:
-          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
-        with:
-          repository: zama-ai/slab
-          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
-          --slab-url "${SLAB_URL}"
-        env:
-          JOB_SECRET: ${{ secrets.JOB_SECRET }}
-          SLAB_URL: ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: benchmark_wasm_client/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, wasm-client-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -0,0 +1,234 @@
+# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
+name: benchmark_wasm_client_common
+
+on:
+  workflow_call:
+    inputs:
+      browser:
+        type: string # Use comma separated values to generate an array
+        default: chrome,firefox
+    secrets:
+      REPO_CHECKOUT_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members and GitHub can trigger this workflow
+
+jobs:
+  prepare-matrix:
+    name: benchmark_wasm_client_common/prepare-matrix
+    runs-on: ubuntu-latest
+    outputs:
+      browser: ${{ steps.set_matrix_arg.outputs.browser }}
+    steps:
+      - name: Parse user inputs
+        shell: python
+        env:
+          INPUTS_BROWSER: ${{ inputs.browser }}
+        run: |
+          import os
+
+          inputs_browser = os.environ["INPUTS_BROWSER"]
+          env_file = os.environ["GITHUB_ENV"]
+
+          split_browser = inputs_browser.replace(" ", "").split(",")
+
+          with open(env_file, "a") as f:
+            f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
+
+      - name: Set matrix arguments output
+        id: set_matrix_arg
+        run: | # zizmor: ignore[template-injection] this env variable is safe
+          echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
+
+  setup-instance:
+    name: benchmark_wasm_client_common/setup-instance
+    needs: prepare-matrix
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  wasm-client-benchmarks:
+    name: benchmark_wasm_client_common/wasm-client-benchmarks
+    needs: [ prepare-matrix, setup-instance ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      max-parallel: 1
+      matrix:
+        browser: ${{ fromJSON(needs.prepare-matrix.outputs.browser) }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: nightly
+
+      - name: Get Node version
+        run: |
+          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
+
+      - name: Node cache restoration
+        id: node-cache
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}
+
+      - name: Install Node
+        if: steps.node-cache.outputs.cache-hit != 'true'
+        run: |
+          make install_node
+
+      - name: Node cache save
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        if: steps.node-cache.outputs.cache-hit != 'true'
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}
+
+      - name: Install web resources
+        run: |
+          make install_"${BROWSER}"_browser
+          make install_"${BROWSER}"_web_driver
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Run benchmarks
+        run: |
+          make bench_web_js_api_parallel_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Run benchmarks (unsafe coop)
+        run: |
+          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Parse results
+        run: |
+          make parse_wasm_benchmarks
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
+          --database tfhe_rs \
+          --hardware "m6i.4xlarge" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --key-gen
+          rm tfhe-benchmark/wasm_pk_gen.csv
+        env:
+          REF_NAME: ${{ github.ref_name }}
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        with:
+          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          repository: zama-ai/slab
+          path: slab
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: benchmark_wasm_client_common/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, wasm-client-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_audit.yml
+++ b/.github/workflows/cargo_audit.yml
@@ -24,9 +24,11 @@ permissions: {}
 jobs:
  audit:
    name: cargo_audit/audit
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -24,7 +24,7 @@ jobs:
    outputs:
      matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -57,9 +57,7 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  SLACKIFY_MARKDOWN: true
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
+
  LINELINT_VERSION: 0.0.6
  LINELINT_CHECKSUM: "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b"

@@ -69,37 +67,10 @@ permissions:
 # zizmor: ignore[concurrency-limits] caller workflow is responsible for the concurrency

 jobs:
-  setup-instance:
-    name: cargo_build_common/setup-instance
-    if: inputs.run-pcc-cpu-batch || inputs.run-pcc-hpu || inputs.run-build || inputs.run-build-layers || inputs.run-build-tfhe-full || inputs.run-build-c-api
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      run_attempt: ${{ github.run_attempt }} # On a re-run with a successful previous run for this job, the run_attempt will not be incremented
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
  prepare-matrix:
    name: cargo_build_common/prepare-matrix
+    if: inputs.run-pcc-cpu-batch || inputs.run-pcc-hpu || inputs.run-build || inputs.run-build-layers || inputs.run-build-tfhe-full || inputs.run-build-c-api
    runs-on: ubuntu-latest
-    needs: setup-instance
    outputs:
      runners: ${{ steps.set_matrix_runners.outputs.runners }}
    steps:
@@ -107,12 +78,12 @@ jobs:
        shell: python
        env:
          INPUTS_EXTRA_RUNNERS_TO_USE: ${{ inputs.extra-runners-to-use }}
-          REMOTE_RUNNER_LABEL: ${{ needs.setup-instance.outputs.runner-name }}
+          REMOTE_RUNNER: "runs-on=${{ github.run_id }}/runner=cpu-small"
        run: |
          import os
          
          inputs_extra_runners = os.environ["INPUTS_EXTRA_RUNNERS_TO_USE"]
-          remote_runner_label = os.environ["REMOTE_RUNNER_LABEL"]
+          remote_runner_label = os.environ["REMOTE_RUNNER"]
          env_file = os.environ["GITHUB_ENV"]
          
          runners = [remote_runner_label, ]
@@ -130,7 +101,7 @@ jobs:

  builds:
    name: cargo_build_common/builds
-    needs: [ setup-instance, prepare-matrix ]
+    needs: prepare-matrix
    runs-on: ${{ matrix.runner }}
    strategy:
      matrix:
@@ -140,7 +111,7 @@ jobs:
      result: ${{ steps.set_builds_result.outputs.result }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -159,6 +130,35 @@ jobs:
          chmod +x linelint-linux-amd64
          ln -s "$(pwd)/linelint-linux-amd64" /usr/local/bin/linelint

+      - name: Get Node version  # exports NODE_VERSION (output of `make node_version`) for the later steps of this job
+        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'  # Node tooling is only set up for the pcc_batch_2 batch
+        run: |
+          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
+
+      - name: Node cache restoration  # restores ~/.nvm and ~/.npm from a cache keyed on the Node version
+        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
+        id: node-cache  # the "Node cache save" step reads this step's cache-hit output
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}  # NODE_VERSION is written to GITHUB_ENV by the "Get Node version" step
+
+      - name: Install Node  # delegates the installation to the `install_node` make target
+        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'  # same gating as the other Node steps
+        run: |
+          make install_node
+
+      - name: Node cache save  # stores ~/.nvm and ~/.npm after installation
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'  # skipped when the restore step reported a cache hit
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}  # same paths and key as the "Node cache restoration" step
+
      - name: Run pcc checks batch
        if: inputs.run-pcc-cpu-batch
        run: |
@@ -230,29 +230,3 @@ jobs:
        if: ${{ always() }}
        run: | # zizmor: ignore[template-injection] this context variable is safe
          echo "result=${{ job.status }}" >> "${GITHUB_OUTPUT}"
-
-  teardown-instance:
-    name: cargo_build_common/teardown-instance
-    if: ${{ always() &&
-      needs.setup-instance.result == 'success' &&
-      github.run_attempt == needs.setup-instance.outputs.run_attempt }} # Only run if setup-instance has been executed during this run attempt
-    needs: [setup-instance, builds]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cargo-builds) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -26,7 +26,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -24,7 +24,7 @@ jobs:
        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -2,6 +2,7 @@
 name: cargo_test_fft

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -22,6 +23,8 @@ permissions:
 jobs:
  should-run:
    name: cargo_test_fft/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -29,7 +32,7 @@ jobs:
      fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -56,7 +59,7 @@ jobs:
        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +95,7 @@ jobs:
    if: needs.should-run.outputs.fft_test == 'true'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -2,6 +2,7 @@
 name: cargo_test_ntt

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -24,6 +25,8 @@ permissions:
 jobs:
  should-run:
    name: cargo_test_ntt/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -31,7 +34,7 @@ jobs:
      ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -60,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,7 +90,7 @@ jobs:
        os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -143,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -18,7 +18,7 @@ jobs:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
        with:
-          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
+          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc|perf)(\([\w\-_]+\))?\!?\:) .+$'
          flags: "gs"
          error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
          excludeDescription: "true" # optional: this excludes the description body of a pull request
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -43,14 +43,14 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
+        uses: zizmorcore/zizmor-action@0dce2577a4760a2749d8cfb7a84b7d5585ebcb7d # v0.5.0
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6124774845927d14c601359ab8138699fa5b70c3 # v4.0.1
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@d5d20e15f2736816ee0e001ba8b24b54d9ffcff4 # v5.0.0
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -23,34 +23,16 @@ permissions:
 # zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow

 jobs:
-  setup-instance:
-    name: code_coverage/setup-instance
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
  code-coverage-tests:
    name: code_coverage/code-coverage-tests
-    needs: setup-instance
    concurrency:
      group: ${{ github.workflow_ref }}_${{ github.event_name }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -121,26 +103,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: code_coverage/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, code-coverage-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (code-coverage-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -10,10 +10,10 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
+

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,42 +27,47 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  setup-instance:
-    name: csprng_randomness_tests/setup-instance
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+  should-run:
+    name: csprng_randomness_tests/should-run
+    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      csprng_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.csprng_any_changed }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            csprng:
+              - Cargo.toml
+              - tfhe/Cargo.toml
+              - tfhe-csprng/**
+              - utils/tfhe-versionable/**
+              - .github/workflows/csprng_randomness_tests.yml

  csprng-randomness-tests:
    name: csprng_randomness_tests/csprng-randomness-tests
-    needs: setup-instance
+    needs: should-run
+    if: github.event_name == 'workflow_dispatch' ||
+      (contains(github.event.label.name, 'approved') && needs.should-run.outputs.csprng_test == 'true')
    concurrency:
      group: ${{ github.workflow_ref }}_${{ github.sha }}_${{ github.event_name }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -76,34 +81,18 @@ jobs:
        run: |
          make dieharder_csprng

+      - name: Set pull-request URL  # exports PULL_REQUEST_MD_LINK, which the Slack message of this job interpolates
+        if: ${{ failure() && github.event_name == 'pull_request' }}  # only after a failure, and only for pull_request events
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:  # values reach the script as environment variables rather than being templated into it
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: csprng_randomness_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, csprng-randomness-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -9,6 +9,9 @@ on:
        type: string
      layer:
        type: string
+      bench_subset:
+        type: string
+        default: all
      pbs_kind: # Valid values are 'classical', 'multi_bit' or 'any'
        type: string
      grouping_factor: # Valid values are 2, 3, or 4
@@ -16,6 +19,9 @@ on:
        default: 4
      bench_type: # Valid values are 'latency', 'throughput'
        type: string
+      name_suffix:
+        type: string
+        default: _mean_avx512
      backend_comparison:
        type: boolean
        default: false
@@ -43,7 +49,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'

@@ -60,6 +66,8 @@ jobs:
          --pbs-kind "${PBS_KIND}" \
          --grouping-factor "${GROUPING_FACTOR}" \
          --bench-type "${BENCH_TYPE}" \
+          --bench-subset "${BENCH_SUBSET}" \
+          --name-suffix "${NAME_SUFFIX}" \
          --time-span-days "${TIME_SPAN}"
        env:
          OUTPUT_FILENAME: ${{ inputs.output_filename }}
@@ -70,6 +78,8 @@ jobs:
          PBS_KIND: ${{ inputs.pbs_kind }}
          GROUPING_FACTOR: ${{ inputs.grouping_factor }}
          BENCH_TYPE: ${{ inputs.bench_type }}
+          BENCH_SUBSET: ${{ inputs.bench_subset }}
+          NAME_SUFFIX: ${{ inputs.name_suffix }}
          TIME_SPAN: ${{ inputs.time_span_days }}
          DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
          DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
@@ -79,7 +89,7 @@ jobs:
        if: inputs.backend_comparison == false
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
        with:
-          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
+          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
          # This will upload all the file generated
          path: ${{ inputs.output_filename }}*.svg
          retention-days: 60
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -51,7 +51,7 @@ jobs:
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

  cpu-integer-throughput-table:
-    name: generate_documentation_svgs/cpu-integer-latency-table
+    name: generate_documentation_svgs/cpu-integer-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
@@ -150,6 +150,124 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

+  # -----------------------------------------------------------
+  # ZK benchmarks tables
+  # -----------------------------------------------------------
+
+  cpu-zk-server-latency-table:  # ZK latency table: cpu backend, integer layer
+    name: generate_documentation_svgs/cpu-zk-server-latency-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-cpu-svgs  # only when the generate-cpu-svgs input is set
+    with:  # name_suffix is not set, so the common workflow default (_mean_avx512) applies
+      backend: cpu
+      hardware_name: hpc7a.96xlarge
+      layer: integer
+      bench_subset: zk  # forwarded as --bench-subset (common workflow default: all)
+      pbs_kind: classical
+      bench_type: latency
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: cpu-zk-benchmark-latency  # base name; the common workflow uploads "<output_filename>*.svg"
+    secrets:  # database credentials handed to the reusable workflow
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  cpu-zk-server-throughput-table:  # ZK throughput table: cpu backend, integer layer
+    name: generate_documentation_svgs/cpu-zk-server-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-cpu-svgs  # only when the generate-cpu-svgs input is set
+    with:  # name_suffix is not set, so the common workflow default (_mean_avx512) applies
+      backend: cpu
+      hardware_name: hpc7a.96xlarge
+      layer: integer
+      bench_subset: zk  # forwarded as --bench-subset (common workflow default: all)
+      pbs_kind: classical
+      bench_type: throughput
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: cpu-zk-benchmark-throughput  # base name; the common workflow uploads "<output_filename>*.svg"
+    secrets:  # database credentials handed to the reusable workflow
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  cpu-zk-client-latency-table:  # ZK latency table: cpu backend, wasm layer
+    name: generate_documentation_svgs/cpu-zk-client-latency-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-cpu-svgs  # only when the generate-cpu-svgs input is set
+    with:
+      backend: cpu
+      hardware_name: m6i.4xlarge
+      layer: wasm
+      bench_subset: zk  # forwarded as --bench-subset (common workflow default: all)
+      pbs_kind: classical
+      bench_type: latency
+      name_suffix: _chrome_mean  # overrides the common workflow default (_mean_avx512)
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: cpu-zk-wasm-benchmark-latency  # base name; the common workflow uploads "<output_filename>*.svg"
+    secrets:  # database credentials handed to the reusable workflow
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  # -----------------------------------------------------------
+  # ERC20 benchmarks tables
+  # -----------------------------------------------------------
+
+  cpu-erc20-latency-throughput-table:  # ERC20 table: cpu backend, hlapi layer
+    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-cpu-svgs  # only when the generate-cpu-svgs input is set
+    with:  # name_suffix is not set, so the common workflow default (_mean_avx512) applies
+      backend: cpu
+      hardware_name: hpc7a.96xlarge
+      layer: hlapi
+      bench_subset: erc20  # forwarded as --bench-subset (common workflow default: all)
+      pbs_kind: classical
+      bench_type: both  # NOTE(review): generate_svg_common.yml documents only 'latency' and 'throughput' as valid — confirm 'both' is accepted
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput  # base name; the common workflow uploads "<output_filename>*.svg"
+    secrets:  # database credentials handed to the reusable workflow
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  gpu-erc20-latency-throughput-table:  # ERC20 table: gpu backend, hlapi layer
+    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs  # only when the generate-gpu-svgs input is set
+    with:  # NOTE(review): name_suffix falls back to the default _mean_avx512 — confirm that suits the gpu backend
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: hlapi
+      bench_subset: erc20  # forwarded as --bench-subset (common workflow default: all)
+      pbs_kind: multi_bit
+      grouping_factor: 4  # same value as the common workflow default
+      bench_type: both  # NOTE(review): generate_svg_common.yml documents only 'latency' and 'throughput' as valid — confirm 'both' is accepted
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput  # base name; the common workflow uploads "<output_filename>*.svg"
+    secrets:  # database credentials handed to the reusable workflow
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  hpu-erc20-latency-throughput-table:  # ERC20 table: hpu backend, hlapi layer
+    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-hpu-svgs  # only when the generate-hpu-svgs input is set
+    with:  # NOTE(review): name_suffix falls back to the default _mean_avx512 — confirm that suits the hpu backend
+      backend: hpu
+      hardware_name: hpu_x1
+      layer: hlapi
+      bench_subset: erc20  # forwarded as --bench-subset (common workflow default: all)
+      pbs_kind: classical
+      bench_type: both  # NOTE(review): generate_svg_common.yml documents only 'latency' and 'throughput' as valid — confirm 'both' is accepted
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput  # base name, no extension: the common workflow uploads "<output_filename>*.svg"
+    secrets:  # database credentials handed to the reusable workflow
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
  # -----------------------------------------------------------
  # PBS benchmarks tables
  # -----------------------------------------------------------
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -19,8 +19,8 @@ on:
  pull_request:
    types: [ labeled ]
  schedule:
-    # Nightly tests @ 1AM after each work day
-    - cron: "0 1 * * MON-FRI"
+    # Odd days of the month at 1AM UTC (roughly every other day; the 31st and the following 1st run back to back)
+    - cron: "0 1 */2 * *"

 permissions:
  contents: read
@@ -37,11 +37,11 @@ jobs:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ["self-hosted", "4090-desktop"]
-    timeout-minutes: 1440 # 24 hours
+    timeout-minutes: 2880 # 48 hours

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -23,8 +23,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # every month
-    - cron: "0 0 1 * *"
+    # Every Friday at noon (UTC)
+    - cron: "0 12 * * 5"

 permissions:
  contents: read
@@ -35,15 +35,15 @@ jobs:
  setup-instance:
    name: gpu_code_validation_tests/setup-instance
    runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved')
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -93,6 +93,11 @@ jobs:

      - name: Find tools
        run: |
+          # Disable unattended-upgrades to avoid lock issues
+          sudo systemctl disable --now unattended-upgrades
+
+          sudo apt-get clean
+          sudo rm -rf /var/lib/apt/lists/*
          sudo apt update && sudo apt install -y valgrind 
          find /usr -executable -name "compute-sanitizer"
          which valgrind
@@ -106,6 +111,10 @@ jobs:
        run: |
          make test_high_level_api_gpu_valgrind

+      - name: Run CUDA backend racecheck tests  # presumably driven by compute-sanitizer (located in "Find tools") — confirm against the make target
+        run: |
+          make test_cuda_backend_race_check
+
  slack-notify:
    name: gpu_code_validation_tests/slack-notify
    needs: [ setup-instance, cuda-tests-linux ]
@@ -137,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: gpu_fast_h100_tests
+name: gpu_core_h100_tests

 env:
  CARGO_TERM_COLOR: always
@@ -32,7 +32,7 @@ permissions:

 jobs:
  should-run:
-    name: gpu_fast_h100_tests/should-run
+    name: gpu_core_h100_tests/should-run
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -61,15 +61,12 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
-              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
-              - '.github/workflows/gpu_fast_h100_tests.yml'
-              - scripts/integer-tests.sh
-              - ci/slab.toml
+              - '.github/workflows/gpu_core_h100_tests.yml'

  setup-instance:
-    name: gpu_fast_h100_tests/setup-instance
+    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -87,7 +84,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,7 +108,7 @@ jobs:
          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
-    name: gpu_fast_h100_tests/cuda-tests-linux
+    name: gpu_core_h100_tests/cuda-tests-linux
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -129,7 +126,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -155,20 +152,8 @@ jobs:
          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

-      - name: Run user docs tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
-
  slack-notify:
-    name: gpu_fast_h100_tests/slack-notify
+    name: gpu_core_h100_tests/slack-notify
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -187,10 +172,10 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+          SLACK_MESSAGE: "Core H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

  teardown-instance:
-    name: gpu_fast_h100_tests/teardown-instance
+    name: gpu_core_h100_tests/teardown-instance
    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
@@ -198,7 +183,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -39,7 +39,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -64,8 +64,6 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_fast_tests.yml'
-              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_fast_tests/setup-instance
@@ -79,7 +77,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +112,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -151,7 +149,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          make test_high_level_api_gpu_fast

  slack-notify:
    name: gpu_fast_tests/slack-notify
@@ -184,7 +182,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,7 +68,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -66,7 +66,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/**_multi_gpu_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_full_multi_gpu_tests/setup-instance
@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +115,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,7 +153,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          make test_high_level_api_gpu_fast

  slack-notify:
    name: gpu_full_multi_gpu_tests/slack-notify
@@ -187,7 +186,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -0,0 +1,207 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: gpu_hlapi_h100_tests
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  PULL_REQUEST_MD_LINK: ""
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]
+
+permissions:
+  contents: read
+
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
+
+jobs:
+  should-run:  # Gate job: reports through its `gpu_test` output whether the GPU tests are needed for this event
+    name: gpu_hlapi_h100_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:  # gpu_test is true when IS_PULL_REQUEST is 'false'; otherwise it takes the changed-files result
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0  # Full history; presumably required by the changed-files comparison below - confirm
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files  # Referenced by the gpu_test job output above
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
+              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+
+  setup-instance:  # Selects the runner for cuda-tests-linux: on-demand Slab instance, permanent h100x1 fallback, or the external-contribution runner
+    name: gpu_hlapi_h100_tests/setup-instance
+    needs: should-run  # NOTE(review): this workflow only triggers on workflow_dispatch and 'labeled' pull_request events, so the "action != 'labeled'" clause below looks redundant here - confirm
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
+      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
+      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
+      # otherwise we'll try to run the next job on a non-existing on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}  # Read by cuda-tests-linux and teardown-instance
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true  # A failed start must not fail the job; the fallback step below checks this step's outcome
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+      # This allows falling back to the permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
+      # This instance is spawned specifically for pull requests from forked repositories.
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cuda-tests-linux:  # Runs the user-doc, C API and high-level API GPU test targets on the runner chosen by setup-instance
+    name: gpu_hlapi_h100_tests/cuda-tests-linux
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # In-progress runs are cancelled except on main
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.8"
+            gcc: 11
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'  # Only when the on-demand instance was started
+        uses: ./.github/actions/gpu_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+      - name: Enable nvidia multi-process service
+        run: |
+          nvidia-cuda-mps-control -d
+
+      - name: Run user docs tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
+
+  slack-notify:  # Posts the test result to Slack when a needed job failed and cuda-tests-linux was not skipped
+    name: gpu_hlapi_h100_tests/slack-notify
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    continue-on-error: true  # A failure in this notification job must not fail the workflow run
+    steps:
+      - name: Set pull-request URL  # Exports PULL_REQUEST_MD_LINK, which the message below interpolates
+        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Send message
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "HL API H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:  # Stops the on-demand instance; runs even after failures, but only if that instance was actually started
+    name: gpu_hlapi_h100_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification  # The message names this workflow so a teardown failure can be traced to it
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (gpu_hlapi_h100_tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,7 +65,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -31,18 +31,50 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  should-run:  # Gate job: reports through its `gpu_test` output whether the sanitizer run is needed for this event
+    name: gpu_memory_sanitizer/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:  # NOTE(review): reads env.IS_PULL_REQUEST, which is not defined in the visible hunk - confirm the workflow-level env provides it
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0  # Full history; presumably required by the changed-files comparison below - confirm
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}  # NOTE(review): CHECKOUT_TOKEN is expected from the workflow-level env (not visible here) - confirm
+
+      - name: Check for file changes
+        id: changed-files  # Referenced by the gpu_test job output above
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            gpu:
+              - Cargo.toml
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - '.github/workflows/gpu_memory_sanitizer.yml'
+
  setup-instance:
    name: gpu_memory_sanitizer/setup-instance
+    needs: should-run
    runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved')
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +110,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -31,18 +31,50 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  should-run:  # Gate job: reports through its `gpu_test` output whether the H100 sanitizer run is needed for this event
+    name: gpu_memory_sanitizer_h100/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:  # NOTE(review): reads env.IS_PULL_REQUEST, which is not defined in the visible hunk - confirm the workflow-level env provides it
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0  # Full history; presumably required by the changed-files comparison below - confirm
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}  # NOTE(review): CHECKOUT_TOKEN is expected from the workflow-level env (not visible here) - confirm
+
+      - name: Check for file changes
+        id: changed-files  # Referenced by the gpu_test job output above
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            gpu:
+              - Cargo.toml
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - '.github/workflows/gpu_memory_sanitizer_h100.yml'
+
  setup-instance:
    name: gpu_memory_sanitizer/setup-instance
+    needs: should-run
    runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved')
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +110,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -74,12 +74,12 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

-      - name: Install CUDA
+      - name: Install CUDA and other dependencies
        if: env.SECRETS_AVAILABLE == 'false'
        shell: bash
        run: |
@@ -90,6 +90,12 @@ jobs:
          echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
          sha256sum -c checksum
          sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
+
+          # Disable unattended-upgrades to avoid lock issues
+          sudo systemctl disable --now unattended-upgrades
+
+          sudo apt-get clean
+          sudo rm -rf /var/lib/apt/lists/*
          sudo apt update
          sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
        env:
@@ -130,6 +136,17 @@ jobs:
        run: |
          make pcc_gpu

+      - name: Run semgrep and lint checks on CUDA code  # Refreshes the apt state, installs python3-venv, then runs the make target
+        run: |
+          # Disable unattended-upgrades to avoid lock issues
+          sudo systemctl disable --now unattended-upgrades
+
+          sudo apt-get clean
+          sudo rm -rf /var/lib/apt/lists/*
+          sudo apt-get update  # apt-get rather than apt: apt's command-line interface is not stable for scripts
+          sudo apt-get -y install python3-venv
+          make semgrep_and_lint_gpu_code
+
      - name: Check build with hpu enabled
        run: |
          make clippy_gpu_hpu
@@ -159,7 +176,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -66,7 +66,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_signed_integer_classic_tests/setup-instance
@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +115,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +169,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -66,7 +66,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
@@ -87,7 +86,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +128,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +183,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -67,7 +67,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_signed_integer_tests/setup-instance
@@ -82,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +116,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +178,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -66,7 +66,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_unsigned_integer_classic_tests/setup-instance
@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +115,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +169,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -66,7 +66,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
@@ -87,7 +86,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +128,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +183,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -67,7 +67,6 @@ jobs:
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh
-              - ci/slab.toml

  setup-instance:
    name: gpu_unsigned_integer_tests/setup-instance
@@ -82,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +116,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +178,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -0,0 +1,182 @@
+# Compile and test zk-cuda-backend (runs the ZK-related GPU make targets listed in the cuda-tests-linux job)
+name: gpu_zk_tests
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}  # link to this run, used in Slack messages
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8 * 1024 * 1024 bytes (8 MiB)
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}  # compared against the string 'false' in should-run
+  PULL_REQUEST_MD_LINK: ""  # overwritten through GITHUB_ENV by slack-notify on pull-request runs
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}  # falls back to GITHUB_TOKEN when the repo token is unset
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"  # runner used by setup-instance when SECRETS_AVAILABLE is 'false'
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+
+permissions:
+  contents: read  # workflow-wide default; should-run additionally requests pull-requests: read
+
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
+
+jobs:
+  should-run:  # Decides whether the GPU ZK tests are relevant for this event
+    name: gpu_zk_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}  # true outside pull requests, else the changed-files result
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0  # fetch the full history instead of a shallow clone
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files  # NOTE(review): the filter below lists ci/slab.toml, which this change drops from the other GPU test filters - confirm
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
+        with:  # the `gpu` group below is what the `gpu_any_changed` output read above refers to
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/zk-cuda-backend/**
+              - tfhe/src/integer/gpu/zk/**
+              - tfhe-zk-pok/**
+              - 'tfhe/docs/**/**.md'
+              - '.github/workflows/gpu_zk_tests.yml'
+              - ci/slab.toml
+
+  setup-instance:  # Chooses the runner for the tests: a Slab-started instance, or the fallback runner when secrets are missing
+    name: gpu_zk_tests/setup-instance
+    needs: should-run
+    if: github.event_name == 'workflow_dispatch' ||
+      needs.should-run.outputs.gpu_test == 'true'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}  # whichever of the two start steps ran
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # only possible when JOB_SECRET is set
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: gpu-test  # NOTE(review): presumably a profile declared in the Slab configuration - confirm
+
+      # This instance will be spawned especially for pull-request from forked repository
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'  # mutually exclusive with the Slab step above
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cuda-tests-linux:  # Builds and runs the ZK GPU test targets on the runner chosen by setup-instance
+    name: gpu_zk_tests/cuda-tests-linux
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:  # one active run per workflow ref; superseded runs are cancelled except on main
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.8"
+            gcc: 11
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/gpu_setup  # local composite action from this repository
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}  # true when running on the fallback runner
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
+      - name: Enable nvidia multi-process service
+        run: |
+          nvidia-cuda-mps-control -d
+
+      - name: Run zk-cuda-backend integration tests
+        run: |
+          make test_zk_cuda_backend
+          make test_zk_pok_gpu
+          make test_integer_zk_gpu
+          make test_integer_zk_experimental_gpu
+
+  slack-notify:  # Posts the test outcome to Slack when something failed
+    name: gpu_zk_tests/slack-notify
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}  # only after a failure, and only if the tests actually ran
+    continue-on-error: true  # a failure of this job does not fail the workflow run
+    steps:
+      - name: Set pull-request URL
+        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:  # passed through env rather than interpolated into the script
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Send message
+        if: env.SECRETS_AVAILABLE == 'true'  # the Slack settings come from secrets
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "ZK GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:  # Stops the Slab instance started by setup-instance
+    name: gpu_zk_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}  # runs even if the tests failed, as long as setup succeeded
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'  # same condition as the Slab start step in setup-instance
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # label reported by setup-instance
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only when a previous step of this job failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -2,6 +2,7 @@
 name: hpu_hlapi_tests

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -12,9 +13,7 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
+

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -25,6 +24,8 @@ permissions: {}
 jobs:
  should-run:
    name: hpu_hlapi_tests/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -32,7 +33,7 @@ jobs:
      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -49,41 +50,15 @@ jobs:
              - backends/tfhe-hpu-backend/**
              - mockups/tfhe-hpu-mockup/**

-  setup-instance:
-    name: hpu_hlapi_tests/setup-instance
+  cargo-tests-hpu:
+    name: hpu_hlapi_tests/cargo-tests-hpu (bpr)
    needs: should-run
    if:
      needs.should-run.outputs.hpu_test == 'true' &&
-      ((github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||github.event_name == 'pull_request')
+      ((github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name != 'push')
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
-  cargo-tests-hpu:
-    name: hpu_hlapi_tests/cargo-tests-hpu (bpr)
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -104,27 +79,3 @@ jobs:
          just -f mockups/tfhe-hpu-mockup/Justfile  BUILD_PROFILE=release mockup &
          make HPU_CONFIG=sim test_high_level_api_hpu
          make HPU_CONFIG=sim test_user_doc_hpu
-
-  teardown-instance:
-    name: hpu_hlapi_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [setup-instance, cargo-tests-hpu]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (hpu_hlapi_tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -24,36 +24,18 @@ permissions: {}
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  setup-instance:
-    name: integer_long_run_tests/setup-instance
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
  cpu-tests:
    name: integer_long_run_tests/cpu-tests
-    needs: [ setup-instance ]
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    concurrency:
      group: ${{ github.workflow_ref }}_${{github.event_name}}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-big"
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -74,26 +56,3 @@ jobs:
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU long run tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: integer_long_run_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, cpu-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -41,7 +41,7 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -52,7 +52,7 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -75,6 +75,7 @@ jobs:
    name: make_release_common/provenance
    if: ${{ !inputs.dry-run  }}
    needs: package
+    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
    permissions:
      actions: read # Needed to detect the GitHub Actions environment
@@ -93,7 +94,7 @@ jobs:
      id-token: write # Needed for OIDC token exchange on crates.io
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -117,6 +117,7 @@ jobs:
    name: make_release_cuda/provenance
    if: ${{ !inputs.dry_run  }}
    needs: [package]
+    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
    permissions:
      actions: read # Needed to detect the GitHub Actions environment
@@ -221,7 +222,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -68,7 +68,7 @@ jobs:
      id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -92,7 +92,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
+        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
+        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -5,9 +5,8 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"
+
+  SAGEMATH_VERSION: "10.8"  # quoted: a bare 10.8 is a YAML float, and a tag such as 10.10 would be read as 10.1

 on:
  pull_request:
@@ -26,41 +25,15 @@ permissions: {}
 # zizmor: ignore[concurrency-limits] only Zama organization members and GitHub can trigger this workflow

 jobs:
-  setup-instance:
-    name: parameters_check/setup-instance
-    if:
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch'
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
  params-curves-security-check:
    name: parameters_check/params-curves-security-check
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name != 'push'
+    runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,17 +44,42 @@ jobs:
          toolchain: stable

      - name: Checkout lattice-estimator
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: malb/lattice-estimator
-          path: lattice_estimator
+          path: lattice-estimator
          ref: '352ddaf4a288a0543f5d9eb588d2f89c7acec463'
          persist-credentials: 'false'

-      - name: Install Sage
+      - name: Restore Sagemath image from cache
+        id: docker-cache
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        with:
+          path: /tmp/sagemath_image
+          key: sagemath-image-${{ env.SAGEMATH_VERSION }}
+          # No restore-keys: a prefix match restores another entry without setting cache-hit to 'true'
+
+      - name: Load cached Docker sagemath image
+        if: steps.docker-cache.outputs.cache-hit == 'true'
        run: |
-          sudo apt update
-          sudo apt install -y sagemath
+          docker load -i /tmp/sagemath_image/sagemath.tar
+
+      - name: Pull Docker sagemath image
+        if: steps.docker-cache.outputs.cache-hit != 'true'
+        run: |
+          docker pull sagemath/sagemath:"${VERSION}"
+          mkdir -p /tmp/sagemath_image
+          docker save sagemath/sagemath:"${VERSION}" -o /tmp/sagemath_image/sagemath.tar
+        env:
+          VERSION: ${{ env.SAGEMATH_VERSION }}
+
+      - name: Store Sagemath image in cache
+        if: steps.docker-cache.outputs.cache-hit != 'true'
+        continue-on-error: true
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        with:
+          path: /tmp/sagemath_image
+          key: sagemath-image-${{ env.SAGEMATH_VERSION }}  # must match the restore key; one entry per image version

      - name: Collect parameters
        run: |
@@ -95,7 +93,9 @@ jobs:

      - name: Perform security check
        run: |
-          PYTHONPATH=lattice_estimator sage ci/lattice_estimator.sage
+          docker run \
+          -v "${PWD}":/repo_src \
+          sagemath/sagemath:"${SAGEMATH_VERSION}" /bin/bash /repo_src/scripts/execute_lattice_estimator.sh

      - name: Get time elapsed
        if: ${{ always() }}
@@ -127,27 +127,3 @@ jobs:
          SLACK_MESSAGE: "Security check for parameters finished with status: ${{ job.status }} (analysis took: ${{ env.TIME_ELAPSED }} mins). (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-  teardown-instance:
-    name: parameters_check/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [setup-instance, params-curves-security-check]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (params-curves-security-check) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/pr_milestone_check.yml
+++ b/.github/workflows/pr_milestone_check.yml
@@ -1,67 +0,0 @@
-name: pr_milestone_check
-
-on:
-  pull_request:
-    types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-# external contributors workflows are manually approved
-
-jobs:
-  check-empty-milestone:
-    name: pr_milestone_check/check-empty-milestone
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone == null
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone Missing
-
-            Please assign a milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is missing. This check is failing."
-          exit 1
-
-  check-milestone-open:
-    name: pr_milestone_check/check-milestone-open
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone is closed
-
-            Please assign an open milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is closed. This check is failing."
-          exit 1
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -30,7 +30,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
+          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -47,6 +47,8 @@ jobs:

          echo ">>> Pushing all LFS items..."
          git lfs push --all destination "${DESTINATION_BRANCH}"
+
+          shred --remove .git/config

      - name: git-sync-tags
        env:
@@ -59,7 +61,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
+          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -70,3 +72,5 @@ jobs:

          echo ">>> Pushing git changes..."
          git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
+
+          shred --remove .git/config
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -12,12 +12,13 @@ permissions: {}
 jobs:
  stale:
    name: unverified_prs/stale
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    permissions:
      issues: read # Needed to fetch all issues
      pull-requests: write # Needed to write message and close the PR
    steps:
-      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
+      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0
        with:
          stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
          days-before-stale: 2
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
+backends/tfhe-cuda-backend/cuda/build/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
--- a/.linelint.yml
+++ b/.linelint.yml
@@ -9,7 +9,7 @@ ignore:
  - tfhe/web_wasm_parallel_tests/dist
  - keys
  - coverage
-  - utils/tfhe-lints/ui/main.stderr
+  - utils/tfhe-lints/tests/*/main.stderr
  - utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated

 rules:
--- a/28
+++ b/28
@@ -2,35 +2,37 @@
 # i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
 # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file

-/backends/tfhe-cuda-backend/            @agnesLeroy
+/backends/tfhe-cuda-backend/            @zama-ai/gpu
+/backends/zk-cuda-backend/              @zama-ai/gpu
 /backends/tfhe-hpu-backend/             @zama-ai/hardware

 /tfhe/examples/hpu                      @zama-ai/hardware

-/tfhe/src/core_crypto/                  @IceTDrinker
-/tfhe/src/core_crypto/gpu               @agnesLeroy
+/tfhe/src/core_crypto/                  @IceTDrinker @mayeul-zama
+/tfhe/src/core_crypto/gpu               @zama-ai/gpu
 /tfhe/src/core_crypto/hpu               @zama-ai/hardware

 /tfhe/src/shortint/                     @mayeul-zama @nsarlin-zama

-/tfhe/src/integer/                      @tmontaigu
-/tfhe/src/integer/gpu                   @agnesLeroy
+/tfhe/src/integer/                      @tmontaigu @nsarlin-zama
+/tfhe/src/integer/gpu                   @zama-ai/gpu
 /tfhe/src/integer/hpu                   @zama-ai/hardware

-/tfhe/src/high_level_api/               @tmontaigu
+/tfhe/src/high_level_api/               @tmontaigu @nsarlin-zama

-/tfhe-zk-pok/                           @nsarlin-zama
+/tfhe-zk-pok/                           @nsarlin-zama @tmontaigu
+/tfhe-zk-pok/src/gpu                    @zama-ai/gpu

-/tfhe-benchmark/                        @soonum
+/tfhe-benchmark/                        @soonum @SouchonTheo

-/utils/                                 @nsarlin-zama
+/utils/                                 @nsarlin-zama @SouchonTheo

 /Makefile                               @IceTDrinker @soonum

 /mockups/tfhe-hpu-mockup                @zama-ai/hardware

-/.github/                               @soonum
-/ci/                                    @soonum
-/scripts/                               @soonum
+/.github/                               @soonum @SouchonTheo
+/ci/                                    @soonum @SouchonTheo
+/scripts/                               @soonum @SouchonTheo

-/CODEOWNERS                             @IceTDrinker
+/CODEOWNERS                             @IceTDrinker @nsarlin-zama
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,7 +17,7 @@ Start by [forking](https://docs.github.com/en/pull-requests/collaborating-with-p
 - **Performance**: For optimal performance, it is highly recommended to run **TFHE-rs** code in release mode with cargo's `--release` flag.
 {% endhint %}

-To get more details about the library, please refer to the [documentation](https://docs.zama.ai/tfhe-rs).
+To get more details about the library, please refer to the [documentation](https://docs.zama.org/tfhe-rs).

 ## 2. Creating a new branch

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,10 +9,12 @@ members = [
    "tasks",
    "tfhe-csprng",
    "backends/tfhe-cuda-backend",
+    "backends/zk-cuda-backend",
    "backends/tfhe-hpu-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/tfhe-backward-compat-data",
+    "utils/tfhe-backward-compat-data/crates/add_new_version",
    "utils/param_dedup",
    "tests",
    "mockups/tfhe-hpu-mockup",
@@ -26,6 +28,8 @@ rust-version = "1.91.1"

 [workspace.dependencies]
 aligned-vec = { version = "0.6", default-features = false }
+ark-ec = "0.5.0"
+ark-ff = "0.5.0"
 bytemuck = "1.24"
 dyn-stack = { version = "0.13", default-features = false }
 itertools = "0.14"
@@ -37,7 +41,11 @@ serde = { version = "1.0", default-features = false }
 wasm-bindgen = "0.2.101"
 getrandom = "0.2.8"
 # The project maintainers consider that this is the last version of the 1.3 branch, any newer version should not be trusted
+bindgen = "0.71"
 bincode = "=1.3.3"
+cmake = "0.1"
+pkg-config = "0.3"
+clap = { version = "4.5", features = ["derive"] }

 [profile.bench]
 lto = "fat"
--- a/313
+++ b/313
@@ -1,4 +1,7 @@
 SHELL:=$(shell /usr/bin/env which bash)
+# Enable stop on error, no undefined variables
+# the c flag is to run the script inline
+.SHELLFLAGS := -eu -c
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat nightly-toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
@@ -55,6 +58,9 @@ REGEX_PATTERN?=''
 # tfhe-cuda-backend
 TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
+ZKCUDA_SRC=backends/zk-cuda-backend/cuda
+ZKCUDA_BUILD=$(ZKCUDA_SRC)/build
+ZKCUDARS_SRC=backends/zk-cuda-backend/src

 # tfhe-hpu-backend
 HPU_CONFIG=v80
@@ -264,12 +270,23 @@ install_mlc:
 	cargo install mlc --locked || \
 	( echo "Unable to install mlc, unknown error." && exit 1 )

+fmt: FMT_CHECK =
 .PHONY: fmt # Format rust code
-fmt: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt
+fmt: fmt_internal
+
+check_fmt: FMT_CHECK = --check
+.PHONY: check_fmt # Check rust code format
+check_fmt: fmt_internal
+
+.PHONY: fmt_internal # internal recipe for fmt
+fmt_internal: install_rs_check_toolchain
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt $(FMT_CHECK)
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt $(FMT_CHECK)
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt $(FMT_CHECK)
+	for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
+		echo "fmt $$crate"; \
+		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate fmt $(FMT_CHECK); \
+	done

 .PHONY: fmt_js # Format javascript code
 fmt_js: check_nvm_installed
@@ -279,10 +296,24 @@ fmt_js: check_nvm_installed
 	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt && \
 	$(MAKE) -C tfhe/js_on_wasm_tests fmt

+.PHONY: semgrep_lint_setup_venv # Create venv and install Python dependencies for GPU lint checks
+semgrep_lint_setup_venv:
+	python3 -m venv venv
+	venv/bin/pip install -r scripts/gpu-lint-requirements.txt
+
+.PHONY: semgrep_and_lint_gpu_code # Run semgrep and lint checks on CUDA backend code
+semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
+	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
+		| grep -v '/cmake-build-debug/' \
+		| grep -v '/build/' \
+		| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
+	venv/bin/python3 "scripts/check_scratch_cleanup.py"
+
 .PHONY: fmt_gpu # Format rust and cuda code
 fmt_gpu: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
 	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh
+	cd "$(ZKCUDA_SRC)" && ./format_zk_cuda_backend.sh

 .PHONY: fmt_c_tests # Format c tests
 fmt_c_tests:
@@ -292,13 +323,6 @@ fmt_c_tests:
 fmt_toml: install_taplo
 	taplo fmt

-.PHONY: check_fmt # Check rust code format
-check_fmt: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check
-
 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
 	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
@@ -307,6 +331,7 @@ check_fmt_c_tests:
 check_fmt_gpu: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
 	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
+	cd "$(ZKCUDA_SRC)" && ./format_zk_cuda_backend.sh -c

 .PHONY: check_fmt_js # Check javascript code format
 check_fmt_js: check_nvm_installed
@@ -328,14 +353,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
 		--all-targets \
 		-p tfhe

@@ -349,7 +374,7 @@ clippy_hpu: install_rs_check_toolchain
 .PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
 clippy_gpu_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

@@ -442,7 +467,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 	fi && \
 	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
 		-p tfhe -- --nocapture

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -518,11 +543,10 @@ clippy_param_dedup: install_rs_check_toolchain

 .PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
 clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-backward-compat-data -- --no-deps -D warnings
 	@# Some old crates are x86 specific, only run in that case
 	@if uname -a | grep -q x86; then \
-		RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-			-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all --all-targets \
-			-- --no-deps -D warnings; \
 		for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
 			echo "checking $$crate"; \
 			RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
@@ -551,6 +575,8 @@ clippy_core clippy_tfhe_csprng
 clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p zk-cuda-backend -- --no-deps -D warnings

 .PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
 clippy_hpu_backend: install_rs_check_toolchain
@@ -644,7 +670,7 @@ build_c_api: install_rs_check_toolchain
 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
 		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -722,8 +748,28 @@ test_cuda_backend:
 		"$(MAKE)" -j "$(CPU_COUNT)" && \
 		"$(MAKE)" test

+.PHONY: test_cuda_backend_race_check # Build and run selected CUDA backend tests with Compute Sanitizer racecheck
+test_cuda_backend_race_check:
+	mkdir -p "$(TFHECUDA_BUILD)" && \
+		cd "$(TFHECUDA_BUILD)" && \
+		cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
+		"$(MAKE)" -j "$(CPU_COUNT)" test_tfhe_cuda_backend && \
+		compute-sanitizer --tool racecheck --target-processes all ./tests_and_benchmarks/tests/test_tfhe_cuda_backend \
+			--gtest_filter="*ClassicalProgrammableBootstrap*:*MultiBitProgrammableBootstrap*"
+
+.PHONY: test_zk_cuda_backend # Run the internal tests of the CUDA ZK backend
+test_zk_cuda_backend:
+	mkdir -p "$(ZKCUDA_BUILD)" && \
+		cd "$(ZKCUDA_BUILD)" && \
+		cmake .. -DCMAKE_BUILD_TYPE=Release -DZK_CUDA_BACKEND_BUILD_TESTS=ON && \
+		"$(MAKE)" -j "$(CPU_COUNT)" && \
+		"$(MAKE)" test
+	cd "$(ZKCUDARS_SRC)" && \
+		cargo test --release
+
+
 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu:
@@ -733,11 +779,12 @@ test_core_crypto_gpu:
 		--features=gpu -p tfhe -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --doc --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4
+test_integer_gpu: install_cargo_nextest
+	TEST_THREADS=2 \
+	DOCTEST_THREADS=4 \
+		./scripts/integer-tests.sh \
+		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
+		--tfhe-package "tfhe" --all-but-noise

 .PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
 test_integer_gpu_debug:
@@ -762,7 +809,7 @@ test_integer_hl_test_gpu_check_warnings:
 		--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p tfhe &> /tmp/gpu_compile_output
 	WARNINGS=$$(cat /tmp/gpu_compile_output | grep ": warning #" | grep "\[tfhe-cuda-backend" | grep -v "inline qualifier" || true) && \
 	if [[ "$${WARNINGS}" != "" ]]; then \
-	    echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
+		echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
 		echo "$${WARNINGS}" && exit 1; \
 	fi

@@ -1049,10 +1096,16 @@ test_high_level_api:
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::

-test_high_level_api_gpu: install_cargo_nextest
+test_high_level_api_gpu_fast: install_cargo_nextest # Run all the GPU tests for high_level_api except test_uniformity for oprf which is too long
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-		-E "test(/high_level_api::.*gpu.*/)"
+		-E "test(/high_level_api::.*gpu.*/) and not test(/uniformity/)"
+
+
+test_high_level_api_gpu: install_cargo_nextest # Run all the GPU tests for high_level_api
+	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
+		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
+		-E "test(/high_level_api::.*gpu.*/)"

 test_list_gpu: install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest list --cargo-profile $(CARGO_PROFILE) \
@@ -1152,12 +1205,31 @@ test_tfhe_csprng_big_endian: install_cargo_cross
 	RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu

-
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok --features experimental

+.PHONY: test_zk_pok_gpu # Run tfhe-zk-pok GPU-accelerated tests
+test_zk_pok_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
+
+.PHONY: test_integer_zk_gpu # Run the integer::gpu::zk tests of tfhe with the zk-pok and gpu features
+test_integer_zk_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
+		--features=integer,zk-pok,gpu -p tfhe -- \
+		integer::gpu::zk::
+
+.PHONY: test_integer_zk_experimental_gpu # Run the integer::gpu::zk tests of tfhe with gpu-experimental-zk also enabled
+test_integer_zk_experimental_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
+		--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
+		integer::gpu::zk::
+
+.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
+test_zk_cuda: install_rs_check_toolchain test_zk_cuda_backend test_zk_pok_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
+
 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
@@ -1371,6 +1443,9 @@ clippy_bench: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--features=shortint,internal-keycache \
+		-p tfhe-benchmark -- --no-deps -D warnings

 .PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark
 clippy_bench_gpu: install_rs_check_toolchain
@@ -1405,14 +1480,14 @@ bench_signed_integer: install_rs_check_toolchain

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
 bench_signed_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1428,14 +1503,14 @@ bench_integer_hpu: install_rs_check_toolchain

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1447,26 +1522,47 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain
 	--bench	glwe_packing_compression_128b-integer-bench \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_integer_zk_gpu
-bench_integer_zk_gpu: install_rs_check_toolchain
+.PHONY: bench_msm_zk
+bench_msm_zk: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-msm \
+	--features=zk-pok -p tfhe-benchmark --profile release --
+
+.PHONY: bench_msm_zk_gpu
+bench_msm_zk_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-msm \
+	--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release --
+
+.PHONY: bench_integer_zk_gpu
+bench_integer_zk_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
+
+.PHONY: bench_integer_zk_experimental_gpu
+bench_integer_zk_experimental_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --

 .PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
 bench_integer_aes_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
 bench_integer_aes256_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes256 \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
 bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1475,6 +1571,13 @@ bench_integer_trivium_gpu: install_rs_check_toolchain
 	--bench integer-trivium \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

+.PHONY: bench_integer_kreyvium_gpu # Run benchmarks for kreyvium on GPU backend
+bench_integer_kreyvium_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-kreyvium \
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1509,7 +1612,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1655,11 +1758,18 @@ bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_unsafe_coop_firefox

-.PHONY: bench_hlapi # Run benchmarks for integer operations
-bench_hlapi: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
+.PHONY: bench_hlapi_unsigned # Run benchmarks for unsigned integer operations
+bench_hlapi_unsigned: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi \
+	--bench hlapi_unsigned \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_signed # Run benchmarks for signed integer operations
+bench_hlapi_signed: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi_signed \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
@@ -1689,14 +1799,14 @@ bench_hlapi_erc20: install_rs_check_toolchain
 .PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
 bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
-    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
 bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
-    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

@@ -1735,6 +1845,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

+.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
+bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--package tfhe-zk-pok \
+	--features=gpu-experimental --profile release
+
 .PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
 bench_hlapi_noise_squash: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
@@ -1749,6 +1866,108 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

+.PHONY: bench_hlapi_kvstore # Run benchmarks for Key-Value Store operations
+bench_hlapi_kvstore: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-kvstore \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_summary # Run summary benchmarks
+bench_summary: install_rs_check_toolchain
+	# Arithmetic operations: addition, multiplication, division, comparison
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi_unsigned \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::add|::mul|::gt|::div_rem'
+
+	# Noise squash
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::noise_squash::'
+
+	# Noise squash and compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'
+
+	# ERC20
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
+
+	# DEX
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::no_cmux::'
+
+	# ZK
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,zk-pok,pbs-stats \
+	-p tfhe-benchmark --
+
+	# Compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-glwe_packing_compression \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_summary_gpu # Run summary benchmarks on GPU
+bench_summary_gpu: install_rs_check_toolchain
+	# Arithmetic operations: addition, multiplication, division, comparison
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::add|::mul|::gt|::div_rem'
+
+	# Noise squash
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::noise_squash::'
+
+	# Noise squash and compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
+
+	# ERC20
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
+
+	# DEX
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE)  __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::no_cmux::'
+
+	# ZK
+	# Proof is done on CPU node of the instance
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,zk-pok,pbs-stats \
+	-p tfhe-benchmark -- '::pke_zk_proof'
+	# Verify is done on GPUs
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --
+
+	# Compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-glwe_packing_compression \
+	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_custom # Run benchmarks with a user-defined command
 bench_custom: install_rs_check_toolchain
@@ -1875,7 +2094,7 @@ pcc_batch_1:
 pcc_batch_2:
 	$(call run_recipe_with_details,clippy)
 	$(call run_recipe_with_details,clippy_all_targets)
-	$(call run_recipe_with_details,check_fmt_js)
+	$(call run_recipe_with_details,check_fmt_js)  # This must stay here: the CI pipeline relies on this recipe to conditionally install Node
 	$(call run_recipe_with_details,clippy_test_vectors)
 	$(call run_recipe_with_details,check_test_vectors)

--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 <hr/>

 <p align="center">
-  <a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
+  <a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.org/tfhe-rs"> 📒 Documentation</a> | <a href="https://www.zama.org/community-channels"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
 </p>


@@ -47,7 +47,7 @@ production-ready library for all the advanced features of TFHE.
 - **Ciphertext and server key compression** for efficient data transfer
 - **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.

-*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
+*Learn more about TFHE-rs features in the [documentation](https://docs.zama.org/tfhe-rs).*
 <br></br>

 ## Table of Contents
@@ -149,7 +149,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performance possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.org/tfhe-rs/get-started/quick-start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -163,25 +163,25 @@ to run in release mode with cargo's `--release` flag to have the best performanc
 A document containing scientific and technical details about algorithms implemented into the library is available here: [TFHE-rs: A (Practical) Handbook](https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf).

 ### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
+- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.org/post/tfhe-deep-dive-part-1)
+- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.org/post/tfhe-deep-dive-part-2)
+- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.org/post/tfhe-deep-dive-part-3)
+- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.org/post/tfhe-deep-dive-part-4)
 <br></br>

 ### Tutorials
- [[Video tutorial] Implement signed integers using TFHE-rs ](https://www.zama.ai/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity-bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii-fhe-string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
+- [Video tutorial: Implement signed integers using TFHE-rs](https://www.zama.org/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
+- [Homomorphic parity bit](https://docs.zama.org/tfhe-rs/tutorials/parity-bit)
+- [Homomorphic case changing on ASCII string](https://docs.zama.org/tfhe-rs/tutorials/ascii-fhe-string)
+- [Boolean SHA256 with TFHE-rs](https://www.zama.org/post/boolean-sha256-tfhe-rs)
+- [Dark market with TFHE-rs](https://www.zama.org/post/dark-market-tfhe-rs)
+- [Regular expression engine with TFHE-rs](https://www.zama.org/post/regex-engine-tfhe-rs)

-*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
+*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.org/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
 <br></br>
 ### Documentation

-Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
+Full, comprehensive documentation is available here: [https://docs.zama.org/tfhe-rs](https://docs.zama.org/tfhe-rs).
 <p align="right">
  <a href="#about" > ↑ Back to top </a>
 </p>
@@ -202,7 +202,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
 ### Security model

 By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
-If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).
+If you want to work within the IND-CPA security model, which is less strict than the IND-CPA^D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.org/tfhe-rs).

 [1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf

@@ -231,7 +231,7 @@ To cite TFHE-rs in academic papers, please use the following entry:
 There are two ways to contribute to TFHE-rs:

 - [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
+- Request to become an official contributor by emailing [hello@zama.org](mailto:hello@zama.org).

 Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
 <br></br>
@@ -243,16 +243,16 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
 **Is Zama’s technology free to use?**
 >Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
 >
->Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).
+>Everything we do is open source, and we are very transparent about what that means for our users. You can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.org/post/open-source).

 **What do I need to do if I want to use Zama’s technology for commercial purposes?**
->To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us hello@zama.ai for more information.
+>To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us at hello@zama.org for more information.

 **Do you file IP on your technology?**
 >Yes, all Zama’s technologies are patented.

 **Can you customize a solution for my specific use case?**
->We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.
+>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.org.

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -261,7 +261,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi

 ## Support

-<a target="_blank" href="https://community.zama.ai">
+<a target="_blank" href="https://community.zama.org">
 <picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -5,16 +5,16 @@ edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
 description = "Cuda implementation of TFHE-rs primitives."
-homepage = "https://www.zama.ai/"
-documentation = "https://docs.zama.ai/tfhe-rs"
+homepage = "https://www.zama.org/"
+documentation = "https://docs.zama.org/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

 [build-dependencies]
-cmake = { version = "0.1" }
-pkg-config = { version = "0.3" }
-bindgen = "0.71"
+cmake.workspace = true
+pkg-config.workspace = true
+bindgen.workspace = true

 [features]
 experimental-multi-arch = []
--- a/backends/tfhe-cuda-backend/README.md
+++ b/backends/tfhe-cuda-backend/README.md
@@ -51,4 +51,4 @@ If your machine does not have an available Nvidia GPU, the compilation will work
 ## License

 This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
-please contact us at `hello@zama.ai`.
+please contact us at `hello@zama.org`.
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -87,6 +87,7 @@ fn main() {
            "cuda/include/integer/rerand.h",
            "cuda/include/aes/aes.h",
            "cuda/include/trivium/trivium.h",
+            "cuda/include/kreyvium/kreyvium.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
+++ b/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
@@ -0,0 +1,64 @@
+rules:
+  - id: release-missing-cuda-synchronize
+    message: >-
+      release() method does not call cuda_synchronize_stream or delegate to
+      another release(). All release methods must synchronize the CUDA stream
+      (directly or via delegation) to ensure async GPU operations complete
+      before returning.
+    severity: ERROR
+    languages: [cpp]
+    paths:
+      exclude:
+        - "**/helper_multi_gpu.h"
+    patterns:
+      - pattern: |
+          void release(...) {
+            ...
+          }
+      - pattern-not: |
+          void release(...) {
+            ...
+            cuda_synchronize_stream($S.stream(0), ...);
+            ...
+          }
+      - pattern-not: |
+          void release(cudaStream_t stream, ...) {
+            ...
+            cuda_synchronize_stream(stream, ...);
+            ...
+          }
+      - pattern-not: |
+          void release(...) {
+            ...
+            $MEM->release(...);
+            ...
+          }
+
+
+  - id: cleanup-missing-release-or-synchronize
+    message: >-
+      cleanup_ function does not call release() or cuda_synchronize_stream().
+      All non-async cleanup_ functions must either call release() on a memory
+      structure or synchronize the CUDA stream.
+    severity: ERROR
+    languages: [cpp]
+    patterns:
+      - pattern: |
+          void $FUNC(...) {
+            ...
+          }
+      - metavariable-regex:
+          metavariable: $FUNC
+          regex: ^cleanup_.*(?<!_async)$
+      - pattern-not: |
+          void $FUNC(...) {
+            ...
+            $MEM->release(...);
+            ...
+          }
+      - pattern-not: |
+          void $FUNC(...) {
+            ...
+            cuda_synchronize_stream(...);
+            ...
+          }
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
@@ -3,7 +3,7 @@
 #include "../integer/integer.h"

 extern "C" {
-uint64_t scratch_cuda_integer_aes_encrypt_64(
+uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -12,41 +12,28 @@ uint64_t scratch_cuda_integer_aes_encrypt_64(
    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
    uint32_t sbox_parallelism);

-void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
-                                     CudaRadixCiphertextFFI *output,
-                                     CudaRadixCiphertextFFI const *iv,
-                                     CudaRadixCiphertextFFI const *round_keys,
-                                     const uint64_t *counter_bits_le_all_blocks,
-                                     uint32_t num_aes_inputs, int8_t *mem_ptr,
-                                     void *const *bsks, void *const *ksks);
-
-void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
-                                         int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_key_expansion_64(
+uint64_t scratch_cuda_integer_aes_ctr_256_encrypt_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
+    uint32_t sbox_parallelism);

-void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
-                                   CudaRadixCiphertextFFI *expanded_keys,
-                                   CudaRadixCiphertextFFI const *key,
-                                   int8_t *mem_ptr, void *const *bsks,
-                                   void *const *ksks);
-
-void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
-                                           int8_t **mem_ptr_void);
-
-void cuda_integer_aes_ctr_256_encrypt_64(
+void cuda_integer_aes_ctr_encrypt_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
    const uint64_t *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks);

-uint64_t scratch_cuda_integer_key_expansion_256_64(
+void cleanup_cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
+                                             int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_aes_ctr_256_encrypt_64(CudaStreamsFFI streams,
+                                                 int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_integer_key_expansion_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -54,11 +41,33 @@ uint64_t scratch_cuda_integer_key_expansion_256_64(
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
-                                       CudaRadixCiphertextFFI *expanded_keys,
-                                       CudaRadixCiphertextFFI const *key,
-                                       int8_t *mem_ptr, void *const *bsks,
-                                       void *const *ksks);
+void cuda_integer_key_expansion_64_async(CudaStreamsFFI streams,
+                                         CudaRadixCiphertextFFI *expanded_keys,
+                                         CudaRadixCiphertextFFI const *key,
+                                         int8_t *mem_ptr, void *const *bsks,
+                                         void *const *ksks);
+
+void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
+                                           int8_t **mem_ptr_void);
+
+void cuda_integer_aes_ctr_256_encrypt_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
+    const uint64_t *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+
+uint64_t scratch_cuda_integer_key_expansion_256_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
+void cuda_integer_key_expansion_256_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *expanded_keys,
+    CudaRadixCiphertextFFI const *key, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks);

 void cleanup_cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -29,15 +29,13 @@ template <typename Torus> struct int_aes_lut_buffers {
        allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return a & b; };
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
+
    auto active_streams_and_lut = streams.active_gpu_subset(
        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
        params.pbs_type);
-    this->and_lut->broadcast_lut(active_streams_and_lut);
+    this->and_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_and_lut, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
+
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->flush_lut = new int_radix_lut<Torus>(
@@ -46,14 +44,11 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
      return x & 1;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+
    auto active_streams_flush_lut = streams.active_gpu_subset(
        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
-    this->flush_lut->broadcast_lut(active_streams_flush_lut);
+    this->flush_lut->generate_and_broadcast_lut(
+        active_streams_flush_lut, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -61,14 +56,11 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
      return (x >> 1) & 1;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
-        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_lambda, allocate_gpu_memory);
+
    auto active_streams_carry_lut =
        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
-    this->carry_lut->broadcast_lut(active_streams_carry_lut);
+    this->carry_lut->generate_and_broadcast_lut(
+        active_streams_carry_lut, {0}, {carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

@@ -187,11 +179,11 @@ template <typename Torus> struct int_aes_counter_workspaces {
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

    this->h_counter_bits_buffer =
-        (Torus *)malloc(num_aes_inputs * sizeof(Torus));
-    size_tracker += num_aes_inputs * sizeof(Torus);
+        (Torus *)malloc(safe_mul_sizeof<Torus>(num_aes_inputs));
+    size_tracker += safe_mul_sizeof<Torus>(num_aes_inputs);
    this->d_counter_bits_buffer = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_aes_inputs * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-        size_tracker, allocate_gpu_memory);
+        safe_mul_sizeof<Torus>(num_aes_inputs), streams.stream(0),
+        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
  }

  void release(CudaStreams streams, bool allocate_gpu_memory) {
--- a/backends/tfhe-cuda-backend/cuda/include/checked_arithmetic.h
+++ b/backends/tfhe-cuda-backend/cuda/include/checked_arithmetic.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdio>
+
+#include "device.h"
+
+// Variadic checked multiplication of size_t values, folded left-to-right.
+// Returns true on overflow, in which case *out is left untouched.
+template <typename... Args>
+inline bool checked_mul(size_t *out, size_t first, Args... rest) {
+  size_t result = first;
+  // Short-circuiting fold: unlike a range-for over a braced list, this also
+  // compiles for an empty `rest` and needs no <initializer_list>.
+  if ((__builtin_mul_overflow(result, static_cast<size_t>(rest), &result) ||
+       ...))
+    return true;
+  *out = result;
+  return false;
+}
+
+// Variadic safe multiplication: computes the product and panics on overflow.
+template <typename... Args> inline size_t safe_mul(size_t first, Args... rest) {
+  size_t result;
+  bool overflow = checked_mul(&result, first, rest...);
+  PANIC_IF_FALSE(!overflow, "multiplication overflow wraps size_t");
+  return result;
+}
+
+// Variadic safe multiplication with an appended sizeof(T) factor.
+// Computes (args... * sizeof(T)) with overflow checking.
+template <typename T, typename... Args>
+inline size_t safe_mul_sizeof(Args... args) {
+  return safe_mul(args..., sizeof(T));
+}
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -5,39 +5,36 @@

 extern "C" {

-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
-                                                  uint32_t gpu_index,
-                                                  void *dest, void const *src,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension);
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
-                                                  uint32_t gpu_index,
-                                                  void *dest, void const *src,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension);
+void cuda_convert_lwe_ciphertext_vector_to_gpu_64_async(
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
+    uint32_t number_of_cts, uint32_t lwe_dimension);
+void cuda_convert_lwe_ciphertext_vector_to_cpu_64_async(
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
+    uint32_t number_of_cts, uint32_t lwe_dimension);

-void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
-                                 void *lwe_array_out, void const *glwe_array_in,
-                                 uint32_t const *nth_array, uint32_t num_nths,
-                                 uint32_t lwe_per_glwe, uint32_t glwe_dimension,
-                                 uint32_t polynomial_size);
-
-void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
-                                    void *lwe_array_out, uint32_t size,
-                                    uint32_t log_modulus);
-
-void cuda_modulus_switch_64(void *stream, uint32_t gpu_index, void *lwe_out,
-                            const void *lwe_in, uint32_t size,
-                            uint32_t log_modulus);
-
-void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
-                                     void *lwe_out, const void *lwe_in,
-                                     uint32_t lwe_dimension,
-                                     uint32_t log_modulus);
-
-void cuda_glwe_sample_extract_128(
+void cuda_glwe_sample_extract_64_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
-    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
+    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
+    uint32_t glwe_dimension, uint32_t polynomial_size);
+
+void cuda_modulus_switch_inplace_64_async(void *stream, uint32_t gpu_index,
+                                          void *lwe_array_out, uint32_t size,
+                                          uint32_t log_modulus);
+
+void cuda_modulus_switch_64_async(void *stream, uint32_t gpu_index,
+                                  void *lwe_out, const void *lwe_in,
+                                  uint32_t size, uint32_t log_modulus);
+
+void cuda_centered_modulus_switch_64_async(void *stream, uint32_t gpu_index,
+                                           void *lwe_out, const void *lwe_in,
+                                           uint32_t lwe_dimension,
+                                           uint32_t log_modulus);
+
+void cuda_glwe_sample_extract_128_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
+    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
+    uint32_t glwe_dimension, uint32_t polynomial_size);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -132,6 +132,8 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);

 uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);

+uint32_t cuda_get_max_shared_memory_per_block(uint32_t gpu_index);
+
 bool cuda_check_support_cooperative_groups();

 bool cuda_check_support_thread_block_clusters();
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -10,11 +10,7 @@ extern std::mutex m;
 extern bool p2p_enabled;
 extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
 extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
-extern "C" {
-int32_t cuda_setup_multi_gpu(int device_0_id);
-}
-
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
 // Define a variant type that can be either a vector or a single pointer
 template <typename Torus>
 using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,

 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type);
+uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
+                                   PBS_TYPE pbs_type);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -80,7 +78,15 @@ public:
        _streams, _gpu_indexes,
        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
  }
-
+  // Returns a subset of this set as an active subset for pbs128. An active
+  // subset is one that is temporarily used to perform some computation. For
+  // pbs128, the threshold is different, because the original threshold was
+  // designed for 2_2 params.
+  CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
+  }
  // Returns a CudaStreams struct containing only the ith stream
  CudaStreams get_ith(int i) const {
    return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -144,9 +150,9 @@ public:
        _gpu_count(src._gpu_count), _owns_streams(false) {}

  CudaStreams &operator=(CudaStreams const &other) {
-    PANIC_IF_FALSE(this->_streams == nullptr ||
-                       this->_streams == other._streams,
-                   "Assigning an already initialized CudaStreams");
+    /*    PANIC_IF_FALSE(this->_streams == nullptr ||
+                           this->_streams == other._streams,
+                       "Assigning an already initialized CudaStreams");*/
    this->_streams = other._streams;
    this->_gpu_indexes = other._gpu_indexes;
    this->_gpu_count = other._gpu_count;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -45,12 +45,9 @@ template <typename Torus> struct boolean_bitop_buffer {

        // BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
        // only lut for degree = 1 is generated
-        generate_device_accumulator_bivariate_with_factor<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+        lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
+                                                  {lut_bivariate_f},
+                                                  LUT_0_FOR_ALL_BLOCKS, {}, 2);
      }
      break;
    default:
@@ -65,14 +62,8 @@ template <typename Torus> struct boolean_bitop_buffer {
        return x % params.message_modulus;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          message_extract_lut->get_lut(0, 0),
-          message_extract_lut->get_degree(0),
-          message_extract_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_message_extract, gpu_memory_allocated);
-      message_extract_lut->broadcast_lut(active_streams);
+      message_extract_lut->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -142,12 +133,8 @@ template <typename Torus> struct int_bitop_buffer {
          }
        };

-        generate_device_accumulator_bivariate<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+        lut->generate_and_broadcast_bivariate_lut(
+            active_streams, {0}, {lut_bivariate_f}, LUT_0_FOR_ALL_BLOCKS);
      }
      break;
    default:
@@ -156,6 +143,8 @@ template <typename Torus> struct int_bitop_buffer {
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);

+      std::vector<std::function<Torus(Torus)>> lut_funcs;
+      std::vector<uint32_t> lut_indices;
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

@@ -171,14 +160,13 @@ template <typename Torus> struct int_bitop_buffer {
            return x ^ rhs;
          }
        };
-        generate_device_accumulator<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
-            lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_univariate_scalar_f,
-            gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+
+        lut_funcs.push_back(lut_univariate_scalar_f);
+        lut_indices.push_back(i);
      }
+
+      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
+                                      LUT_0_FOR_ALL_BLOCKS);
    }
  }

@@ -211,16 +199,11 @@ template <typename Torus> struct boolean_bitnot_buffer {
        return x % message_modulus;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          message_extract_lut->get_lut(0, 0),
-          message_extract_lut->get_degree(0),
-          message_extract_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_message_extract, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
-      message_extract_lut->broadcast_lut(active_streams);
+
+      message_extract_lut->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -28,20 +28,16 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
      uint32_t bits_per_block = std::log2(params.message_modulus);
      uint32_t msg_modulus = params.message_modulus;

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          [msg_modulus, bits_per_block](Torus x) {
+      auto active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+
+      lut->generate_and_broadcast_lut(
+          active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
            const auto xm = x % msg_modulus;
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
-          },
-          allocate_gpu_memory);
-
-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      lut->broadcast_lut(active_streams);
+          }},
+          LUT_0_FOR_ALL_BLOCKS);

      this->last_block = new CudaRadixCiphertextFFI;

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -85,42 +85,28 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
-        predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
-        predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
-        message_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        message_extract_lut_f, gpu_memory_allocated);
-    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
-    for (int index = 0; index < 2 * num_radix_blocks; index++) {
-      if (index < num_radix_blocks) {
-        h_lut_indexes[index] = 0;
-      } else {
-        h_lut_indexes[index] = 1;
-      }
-    }
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
-        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams_pred =
        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    predicate_lut->broadcast_lut(active_streams_pred);
+    auto lut_index_generator = [num_radix_blocks](Torus *h_lut_indexes,
+                                                  uint32_t num_indexes) {
+      for (int index = 0; index < 2 * num_radix_blocks; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+    };
+
+    predicate_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
+        lut_index_generator);
+
    auto active_streams_msg =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    message_extract_lut->broadcast_lut(active_streams_msg);
+
+    message_extract_lut->generate_and_broadcast_lut(
+        active_streams_msg, {0}, {message_extract_lut_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -1,4 +1,5 @@
 #pragma once
+#include "checked_arithmetic.h"
 #include "cmux.h"
 #include "integer_utilities.h"

@@ -28,7 +29,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    Torus total_modulus = params.message_modulus * params.carry_modulus;
    uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

-    int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
+    int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
    tmp_out = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
@@ -39,22 +40,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

+    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
+        params.glwe_dimension + 1, params.polynomial_size));
+
    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
                                            allocate_gpu_memory, size_tracker);
-    auto is_max_value_f = [max_value](Torus x) -> Torus {
-      return x == max_value;
-    };
-    preallocated_h_lut = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
-        is_max_value->get_degree(0), is_max_value->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_max_value_f, gpu_memory_allocated);

    auto active_streams =
        streams.active_gpu_subset(max_chunks, params.pbs_type);
-    is_max_value->broadcast_lut(active_streams);
+
+    auto is_max_value_f = [max_value](Torus x) -> Torus {
+      return x == max_value;
+    };
+
+    is_max_value->generate_and_broadcast_lut(
+        active_streams, {0}, {is_max_value_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
@@ -103,15 +103,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
-        is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
-
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    is_non_zero_lut->broadcast_lut(active_streams);
+    is_non_zero_lut->generate_and_broadcast_lut(
+        active_streams, {0}, {is_non_zero_lut_f}, LUT_0_FOR_ALL_BLOCKS);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -129,32 +124,27 @@ template <typename Torus> struct int_comparison_eq_buffer {
        return (lhs == rhs);
      }
    };
+
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < total_modulus; i++) {
      auto lut_f = [i, operator_f](Torus x) -> Torus {
        return operator_f(i, x);
      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          scalar_comparison_luts->get_lut(0, i),
-          scalar_comparison_luts->get_degree(i),
-          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f, gpu_memory_allocated);
+      lut_funcs.push_back(lut_f);
+      lut_indices.push_back(i);
    }
-    scalar_comparison_luts->broadcast_lut(active_streams);
+
+    scalar_comparison_luts->generate_and_broadcast_lut(
+        active_streams, lut_indices, lut_funcs, LUT_0_FOR_ALL_BLOCKS);
+
    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
-          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, operator_f, gpu_memory_allocated);
-
-      operator_lut->broadcast_lut(active_streams);
+      operator_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {operator_f}, LUT_0_FOR_ALL_BLOCKS);
    } else {
      operator_lut = nullptr;
    }
@@ -221,28 +211,24 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
        streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    // LUTs
-    tree_inner_leaf_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);

    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    preallocated_h_lut = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
+        params.glwe_dimension + 1, params.polynomial_size));

    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
-        tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        block_selector_f, gpu_memory_allocated);
+    tree_inner_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                 allocate_gpu_memory, size_tracker);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    tree_inner_leaf_lut->broadcast_lut(active_streams);
+    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {block_selector_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
@@ -322,10 +308,10 @@ template <typename Torus> struct int_comparison_diff_buffer {
    reduce_signs_lut =
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);
-    preallocated_h_lut1 = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-    preallocated_h_lut2 = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    preallocated_h_lut1 = (Torus *)malloc(safe_mul_sizeof<Torus>(
+        params.glwe_dimension + 1, params.polynomial_size));
+    preallocated_h_lut2 = (Torus *)malloc(safe_mul_sizeof<Torus>(
+        params.glwe_dimension + 1, params.polynomial_size));
  }

  void release(CudaStreams streams) {
@@ -426,12 +412,8 @@ template <typename Torus> struct int_comparison_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
-        identity_lut->get_degree(0), identity_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, identity_lut_f, gpu_memory_allocated);
-    identity_lut->broadcast_lut(active_streams);
+    identity_lut->generate_and_broadcast_lut(
+        active_streams, {0}, {identity_lut_f}, LUT_0_FOR_ALL_BLOCKS);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -441,13 +423,8 @@ template <typename Torus> struct int_comparison_buffer {
    is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                           allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
-        is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_zero_f, gpu_memory_allocated);
-
-    is_zero_lut->broadcast_lut(active_streams);
+    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
+                                            LUT_0_FOR_ALL_BLOCKS);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -522,16 +499,12 @@ template <typename Torus> struct int_comparison_buffer {
        PANIC("Cuda error: sign_lut creation failed due to wrong function.")
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
-          signed_lut->get_degree(0), signed_lut->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, signed_lut_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      signed_lut->broadcast_lut(active_streams);
+      signed_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {signed_lut_f}, LUT_0_FOR_ALL_BLOCKS);
    }
-    preallocated_h_lut = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
+        params.glwe_dimension + 1, params.polynomial_size));
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -5,14 +5,15 @@
 #include "../integer.h"

 extern "C" {
-uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
+uint64_t scratch_cuda_integer_compress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, uint32_t num_lwes_stored_per_glwe,
+    bool allocate_gpu_memory);

-uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
+uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
@@ -21,12 +22,12 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_compress_radix_ciphertext_64(
+void cuda_integer_compress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
    int8_t *mem_ptr);

-void cuda_integer_decompress_radix_ciphertext_64(
+void cuda_integer_decompress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_in,
    uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
@@ -37,25 +38,26 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
 void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void);

-uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
+uint64_t scratch_cuda_integer_compress_radix_ciphertext_128_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, uint32_t num_lwes_stored_per_glwe,
+    bool allocate_gpu_memory);

-uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
+uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, bool allocate_gpu_memory);

-void cuda_integer_compress_radix_ciphertext_128(
+void cuda_integer_compress_radix_ciphertext_128_async(
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
    int8_t *mem_ptr);

-void cuda_integer_decompress_radix_ciphertext_128(
+void cuda_integer_decompress_radix_ciphertext_128_async(
    CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_in,
    uint32_t const *indexes_array, int8_t *mem_ptr);
@@ -66,12 +68,12 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
 void cleanup_cuda_integer_decompress_radix_ciphertext_128(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

-void cuda_integer_extract_glwe_128(
+void cuda_integer_extract_glwe_128_async(
    CudaStreamsFFI streams, void *glwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_list,
    uint32_t const glwe_index);

-void cuda_integer_extract_glwe_64(
+void cuda_integer_extract_glwe_64_async(
    CudaStreamsFFI streams, void *glwe_array_out,
    CudaPackedGlweCiphertextListFFI const *glwe_list,
    uint32_t const glwe_index);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -10,25 +10,37 @@ template <typename Torus> struct int_compression {
  Torus *tmp_lwe;
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
-  uint32_t lwe_per_glwe;
+  uint32_t num_lwes_stored_per_glwe;
+  uint32_t max_num_glwes;

+  // num_radix_blocks: total LWE ciphertexts (radix blocks) to compress.
+  // num_lwes_stored_per_glwe: max LWEs packed per GLWE (<= polynomial_size),
+  // defined by the chosen parameter set.
  int_compression(CudaStreams streams, int_radix_params compression_params,
-                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
+                  uint32_t num_radix_blocks, uint32_t num_lwes_stored_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
+    this->num_lwes_stored_per_glwe = num_lwes_stored_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

+    // Calculate the actual number of GLWEs needed based on total radix blocks.
+    // This ensures we allocate enough memory when num_radix_blocks >
+    // num_lwes_stored_per_glwe.
+    max_num_glwes = CEIL_DIV(num_radix_blocks, num_lwes_stored_per_glwe);
+
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
-            sizeof(Torus),
+        safe_mul_sizeof<Torus>(
+            (size_t)num_radix_blocks,
+            (size_t)(compression_params.small_lwe_dimension + 1)),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
+            safe_mul_sizeof<Torus>((size_t)max_num_glwes,
+                                   glwe_accumulator_size),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

@@ -41,12 +53,21 @@ template <typename Torus> struct int_compression {
  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(
        tmp_lwe, streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
+    tmp_lwe = nullptr;
+
    cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
-    cleanup_packing_keyswitch_lwe_list_to_glwe(
-        streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
-        gpu_memory_allocated);
+    tmp_glwe_array_out = nullptr;
+
+    if constexpr (sizeof(Torus) == 8)
+      cleanup_cuda_packing_keyswitch_lwe_list_to_glwe_64(
+          streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
+          gpu_memory_allocated);
+    else
+      cleanup_cuda_packing_keyswitch_lwe_list_to_glwe_128(
+          streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
+          gpu_memory_allocated);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };
@@ -79,14 +100,17 @@ template <typename Torus> struct int_decompression {
                                     1);

    tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
+        safe_mul_sizeof<Torus>((size_t)num_blocks_to_decompress,
+                               glwe_accumulator_size),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);
    tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
-        num_blocks_to_decompress * sizeof(uint32_t), streams.stream(0),
-        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+        safe_mul_sizeof<uint32_t>((size_t)num_blocks_to_decompress),
+        streams.stream(0), streams.gpu_index(0), size_tracker,
+        allocate_gpu_memory);
    tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
+        safe_mul_sizeof<Torus>((size_t)num_blocks_to_decompress,
+                               lwe_accumulator_size),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

@@ -106,31 +130,30 @@ template <typename Torus> struct int_decompression {
          encryption_params.carry_modulus;
      auto effective_compression_carry_modulus = 1;

-      generate_device_accumulator_with_encoding<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          decompression_rescale_lut->get_lut(0, 0),
-          decompression_rescale_lut->get_degree(0),
-          decompression_rescale_lut->get_max_degree(0),
-          encryption_params.glwe_dimension, encryption_params.polynomial_size,
-          effective_compression_message_modulus,
-          effective_compression_carry_modulus,
-          encryption_params.message_modulus, encryption_params.carry_modulus,
-          decompression_rescale_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(
          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
-      decompression_rescale_lut->broadcast_lut(active_streams);
+      decompression_rescale_lut->generate_and_broadcast_lut_with_encoding(
+          active_streams, {0}, {decompression_rescale_f},
+          effective_compression_message_modulus,
+          effective_compression_carry_modulus,
+          encryption_params.message_modulus, encryption_params.carry_modulus);
    }
  }
  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+
+    tmp_extracted_glwe = nullptr;
    cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+    tmp_extracted_lwe = nullptr;
    cuda_drop_with_size_tracking_async(tmp_indexes_array, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+    tmp_indexes_array = nullptr;
+
    if constexpr (std::is_same_v<Torus, uint64_t>) {
      decompression_rescale_lut->release(streams);
      delete decompression_rescale_lut;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -283,12 +283,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                     zero_out_if_not_1_lut_2};
    size_t lut_gpu_indexes[2] = {0, 3};
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(lut_gpu_indexes[j]),
-          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
+      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
+                                          {0}, {zero_out_if_not_1_lut_f},
+                                          LUT_0_FOR_ALL_BLOCKS);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -296,12 +293,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    lut_gpu_indexes[0] = 1;
    lut_gpu_indexes[1] = 2;
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(lut_gpu_indexes[j]),
-          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
+      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
+                                          {0}, {zero_out_if_not_2_lut_f},
+                                          LUT_0_FOR_ALL_BLOCKS);
    }

    quotient_lut_1 =
@@ -321,21 +315,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    };
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

-    generate_device_accumulator<Torus>(
-        streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
-        quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
-        quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
-        quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
+    quotient_lut_1->generate_and_broadcast_lut(
+        streams.get_ith(2), {0}, {quotient_lut_1_f}, LUT_0_FOR_ALL_BLOCKS);
+    quotient_lut_2->generate_and_broadcast_lut(
+        streams.get_ith(1), {0}, {quotient_lut_2_f}, LUT_0_FOR_ALL_BLOCKS);
+    quotient_lut_3->generate_and_broadcast_lut(
+        streams.get_ith(0), {0}, {quotient_lut_3_f}, LUT_0_FOR_ALL_BLOCKS);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -350,15 +335,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    luts[0] = message_extract_lut_1;
    luts[1] = message_extract_lut_2;

+    auto active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
+
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      auto active_streams =
-          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      luts[j]->broadcast_lut(active_streams);
+      luts[j]->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }
  }

@@ -451,30 +433,31 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
      Torus ***second_indexes_ptr, Torus ***scalars_ptr, uint32_t num_blocks,
      bool allocate_gpu_memory, uint64_t &size_tracker) {

-    auto first_indexes = (Torus **)malloc(num_blocks * sizeof(Torus *));
-    auto second_indexes = (Torus **)malloc(num_blocks * sizeof(Torus *));
-    auto scalars = (Torus **)malloc(num_blocks * sizeof(Torus *));
+    auto first_indexes = (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+    auto second_indexes =
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+    auto scalars = (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));

    for (int nb = 1; nb <= num_blocks; nb++) {
      first_indexes[nb - 1] = (Torus *)cuda_malloc_with_size_tracking_async(
-          nb * sizeof(Torus), stream, gpu_index, size_tracker,
+          safe_mul_sizeof<Torus>(nb), stream, gpu_index, size_tracker,
          allocate_gpu_memory);
      second_indexes[nb - 1] = (Torus *)cuda_malloc_with_size_tracking_async(
-          nb * sizeof(Torus), stream, gpu_index, size_tracker,
+          safe_mul_sizeof<Torus>(nb), stream, gpu_index, size_tracker,
          allocate_gpu_memory);
      scalars[nb - 1] = (Torus *)cuda_malloc_with_size_tracking_async(
-          nb * sizeof(Torus), stream, gpu_index, size_tracker,
+          safe_mul_sizeof<Torus>(nb), stream, gpu_index, size_tracker,
          allocate_gpu_memory);

      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          first_indexes[nb - 1], first_indexes_for_overflow_sub_gpu_0[nb - 1],
-          nb * sizeof(Torus), stream, gpu_index, allocate_gpu_memory);
+          safe_mul_sizeof<Torus>(nb), stream, gpu_index, allocate_gpu_memory);
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          second_indexes[nb - 1], second_indexes_for_overflow_sub_gpu_0[nb - 1],
-          nb * sizeof(Torus), stream, gpu_index, allocate_gpu_memory);
+          safe_mul_sizeof<Torus>(nb), stream, gpu_index, allocate_gpu_memory);
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          scalars[nb - 1], scalars_for_overflow_sub_gpu_0[nb - 1],
-          nb * sizeof(Torus), stream, gpu_index, allocate_gpu_memory);
+          safe_mul_sizeof<Torus>(nb), stream, gpu_index, allocate_gpu_memory);
      *first_indexes_ptr = first_indexes;
      *second_indexes_ptr = second_indexes;
      *scalars_ptr = scalars;
@@ -488,72 +471,91 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    max_indexes_to_erase = num_blocks;

    first_indexes_for_overflow_sub_gpu_0 =
-        (Torus **)malloc(num_blocks * sizeof(Torus *));
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
    second_indexes_for_overflow_sub_gpu_0 =
-        (Torus **)malloc(num_blocks * sizeof(Torus *));
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
    scalars_for_overflow_sub_gpu_0 =
-        (Torus **)malloc(num_blocks * sizeof(Torus *));
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));

-    Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
-    Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));
+    Torus *h_lut_indexes = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));
+    Torus *h_scalar = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));

    // Extra indexes for the luts in first step
    for (int nb = 1; nb <= num_blocks; nb++) {
      first_indexes_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-              size_tracker, allocate_gpu_memory);
-      for (int index = 0; index < nb; index++) {
-        uint32_t grouping_index = index / group_size;
-        bool is_in_first_grouping = (grouping_index == 0);
-        uint32_t index_in_grouping = index % group_size;
-        bool is_last_index = (index == (nb - 1));
-        if (is_last_index) {
-          if (nb == 1) {
-            h_lut_indexes[index] = 2 * group_size;
+              safe_mul_sizeof<Torus>(nb), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+
+      auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+          bool is_last_index = (index == (nb - 1));
+          if (is_last_index) {
+            if (nb == 1) {
+              h_lut_indexes[index] = 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2;
+            }
+          } else if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
          } else {
-            h_lut_indexes[index] = 2;
+            h_lut_indexes[index] = index_in_grouping + group_size;
          }
-        } else if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
        }
-      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          first_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-          allocate_gpu_memory);
+      };
+
+      generate_lut_indexes<Torus>(streams, index_generator,
+                                  first_indexes_for_overflow_sub_gpu_0[nb - 1],
+                                  nb, 2 * group_size + 1, h_lut_indexes,
+                                  allocate_gpu_memory);
    }
    // Extra indexes for the luts in second step
+    uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
+    uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
    for (int nb = 1; nb <= num_blocks; nb++) {
      second_indexes_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-              size_tracker, allocate_gpu_memory);
+              safe_mul_sizeof<Torus>(nb), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
      scalars_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-              size_tracker, allocate_gpu_memory);
+              safe_mul_sizeof<Torus>(nb), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+
+      auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
+                                                       uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+
+          if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
+          } else if (index_in_grouping == (group_size - 1)) {
+            if (use_seq) {
+              int inner_index = (grouping_index - 1) % (group_size - 1);
+              h_lut_indexes[index] = inner_index + 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2 * group_size;
+            }
+          } else {
+            h_lut_indexes[index] = index_in_grouping + group_size;
+          }
+        }
+      };
+
+      generate_lut_indexes<Torus>(streams, index_generator,
+                                  second_indexes_for_overflow_sub_gpu_0[nb - 1],
+                                  nb, num_luts_second_step, h_lut_indexes,
+                                  allocate_gpu_memory);

      for (int index = 0; index < nb; index++) {
        uint32_t grouping_index = index / group_size;
        bool is_in_first_grouping = (grouping_index == 0);
        uint32_t index_in_grouping = index % group_size;
-
-        if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else if (index_in_grouping == (group_size - 1)) {
-          if (use_seq) {
-            int inner_index = (grouping_index - 1) % (group_size - 1);
-            h_lut_indexes[index] = inner_index + 2 * group_size;
-          } else {
-            h_lut_indexes[index] = 2 * group_size;
-          }
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
-        }
-
        bool may_have_its_padding_bit_set =
            !is_in_first_grouping && (index_in_grouping == group_size - 1);

@@ -568,12 +570,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
        }
      }
      cuda_memcpy_with_size_tracking_async_to_gpu(
-          second_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+          scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar,
+          safe_mul_sizeof<Torus>(nb), streams.stream(0), streams.gpu_index(0),
          allocate_gpu_memory);
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar, nb * sizeof(Torus),
-          streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    }
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_lut_indexes);
@@ -1007,24 +1006,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
-          masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_masking, gpu_memory_allocated);
      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
-      masking_luts_1[i]->broadcast_lut(active_streams_1);
+      masking_luts_1[i]->generate_and_broadcast_lut(
+          active_streams_1, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
-          masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_masking, gpu_memory_allocated);
      auto active_streams_2 =
          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      masking_luts_2[i]->broadcast_lut(active_streams_2);
+      masking_luts_2[i]->generate_and_broadcast_lut(
+          active_streams_2, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1042,15 +1031,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                     message_extract_lut_2};
+
    auto active_streams =
        streams.active_gpu_subset(num_blocks, params.pbs_type);
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      luts[j]->broadcast_lut(active_streams);
+      luts[j]->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }

    // Give name to closures to improve readability
@@ -1076,24 +1062,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0),
-        zero_out_if_overflow_did_not_happen[0]->get_degree(0),
-        zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cur_lut_f, params.message_modulus - 2,
-        gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[0]->broadcast_lut(active_streams);
-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
-        zero_out_if_overflow_did_not_happen[1]->get_degree(0),
-        zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cur_lut_f, params.message_modulus - 1,
-        gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);
+    zero_out_if_overflow_did_not_happen[0]
+        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
+                                               LUT_0_FOR_ALL_BLOCKS, {},
+                                               params.message_modulus - 2);
+    zero_out_if_overflow_did_not_happen[1]
+        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
+                                               LUT_0_FOR_ALL_BLOCKS, {},
+                                               params.message_modulus - 1);

    // create and generate zero_out_if_overflow_happened
    zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -1110,24 +1086,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_happened[0]->get_lut(0, 0),
-        zero_out_if_overflow_happened[0]->get_degree(0),
-        zero_out_if_overflow_happened[0]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
-        gpu_memory_allocated);
-    zero_out_if_overflow_happened[0]->broadcast_lut(active_streams);
-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_happened[1]->get_lut(0, 0),
-        zero_out_if_overflow_happened[1]->get_degree(0),
-        zero_out_if_overflow_happened[1]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
-        gpu_memory_allocated);
-    zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);
+    zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
+        params.message_modulus - 2);
+    zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
+        params.message_modulus - 1);

    // merge_overflow_flags_luts
    merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
@@ -1141,14 +1105,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          merge_overflow_flags_luts[i]->get_lut(0, 0),
-          merge_overflow_flags_luts[i]->get_degree(0),
-          merge_overflow_flags_luts[i]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_bit, gpu_memory_allocated);
-      merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
+      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
+          active_gpu_count_for_bits, {0}, {lut_f_bit}, LUT_0_FOR_ALL_BLOCKS);
    }
  }

@@ -1206,71 +1164,89 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
    max_indexes_to_erase = num_blocks;

    first_indexes_for_overflow_sub =
-        (Torus **)malloc(num_blocks * sizeof(Torus *));
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
    second_indexes_for_overflow_sub =
-        (Torus **)malloc(num_blocks * sizeof(Torus *));
-    scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));
+    scalars_for_overflow_sub =
+        (Torus **)malloc(safe_mul_sizeof<Torus *>(num_blocks));

-    Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
-    Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));
+    Torus *h_lut_indexes = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));
+    Torus *h_scalar = (Torus *)malloc(safe_mul_sizeof<Torus>(num_blocks));

    // Extra indexes for the luts in first step
    for (int nb = 1; nb <= num_blocks; nb++) {
      first_indexes_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-              size_tracker, allocate_gpu_memory);
-      for (int index = 0; index < nb; index++) {
-        uint32_t grouping_index = index / group_size;
-        bool is_in_first_grouping = (grouping_index == 0);
-        uint32_t index_in_grouping = index % group_size;
-        bool is_last_index = (index == (nb - 1));
-        if (is_last_index) {
-          if (nb == 1) {
-            h_lut_indexes[index] = 2 * group_size;
+              safe_mul_sizeof<Torus>(nb), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+
+      auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+          bool is_last_index = (index == (nb - 1));
+          if (is_last_index) {
+            if (nb == 1) {
+              h_lut_indexes[index] = 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2;
+            }
+          } else if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
          } else {
-            h_lut_indexes[index] = 2;
+            h_lut_indexes[index] = index_in_grouping + group_size;
          }
-        } else if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
        }
-      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-          allocate_gpu_memory);
+      };
+
+      generate_lut_indexes<Torus>(
+          streams, index_generator, first_indexes_for_overflow_sub[nb - 1], nb,
+          2 * group_size + 1, h_lut_indexes, allocate_gpu_memory);
    }
    // Extra indexes for the luts in second step
+    uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
+    uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
    for (int nb = 1; nb <= num_blocks; nb++) {
      second_indexes_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-              size_tracker, allocate_gpu_memory);
+              safe_mul_sizeof<Torus>(nb), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
      scalars_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
-              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-              size_tracker, allocate_gpu_memory);
+              safe_mul_sizeof<Torus>(nb), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory);
+
+      auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
+                                                       uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+
+          if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
+          } else if (index_in_grouping == (group_size - 1)) {
+            if (use_seq) {
+              int inner_index = (grouping_index - 1) % (group_size - 1);
+              h_lut_indexes[index] = inner_index + 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2 * group_size;
+            }
+          } else {
+            h_lut_indexes[index] = index_in_grouping + group_size;
+          }
+        }
+      };
+
+      generate_lut_indexes<Torus>(
+          streams, index_generator, second_indexes_for_overflow_sub[nb - 1], nb,
+          num_luts_second_step, h_lut_indexes, allocate_gpu_memory);

      for (int index = 0; index < nb; index++) {
        uint32_t grouping_index = index / group_size;
        bool is_in_first_grouping = (grouping_index == 0);
        uint32_t index_in_grouping = index % group_size;
-
-        if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else if (index_in_grouping == (group_size - 1)) {
-          if (use_seq) {
-            int inner_index = (grouping_index - 1) % (group_size - 1);
-            h_lut_indexes[index] = inner_index + 2 * group_size;
-          } else {
-            h_lut_indexes[index] = 2 * group_size;
-          }
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
-        }
-
        bool may_have_its_padding_bit_set =
            !is_in_first_grouping && (index_in_grouping == group_size - 1);

@@ -1285,12 +1261,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
        }
      }
      cuda_memcpy_with_size_tracking_async_to_gpu(
-          second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
+          scalars_for_overflow_sub[nb - 1], h_scalar,
+          safe_mul_sizeof<Torus>(nb), streams.stream(0), streams.gpu_index(0),
          allocate_gpu_memory);
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
-          streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    }
    free(h_lut_indexes);
    free(h_scalar);
@@ -1557,16 +1530,12 @@ template <typename Torus> struct int_div_rem_memory {
      compare_signed_bits_lut = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          compare_signed_bits_lut->get_lut(0, 0),
-          compare_signed_bits_lut->get_degree(0),
-          compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          f_compare_extracted_signed_bits, gpu_memory_allocated);
      auto active_gpu_count_cmp =
          streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
-      compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
+
+      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
+          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
+          LUT_0_FOR_ALL_BLOCKS);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -53,13 +53,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return count;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
-        univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
-
-    univ_lut_mem->broadcast_lut(active_streams);
+    univ_lut_mem->generate_and_broadcast_lut(
+        active_streams, {0}, {generate_uni_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -70,13 +65,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return 0;
    };

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
-        biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
-
-    biv_lut_mem->broadcast_lut(active_streams);
+    biv_lut_mem->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {generate_bi_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -232,7 +222,7 @@ template <typename Torus> struct int_ilog2_buffer {
        this->sum_output_not_propagated, counter_num_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

-    this->lut_message_not =
+    lut_message_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
                                 allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus)> lut_message_lambda =
@@ -240,16 +230,11 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t message = x % this->params.message_modulus;
      return (~message) % this->params.message_modulus;
    };
-    generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
-                                this->lut_message_not->get_lut(0, 0),
-                                this->lut_message_not->get_degree(0),
-                                this->lut_message_not->get_max_degree(0),
-                                params.glwe_dimension, params.polynomial_size,
-                                params.message_modulus, params.carry_modulus,
-                                lut_message_lambda, allocate_gpu_memory);
+
    auto active_streams =
        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
-    lut_message_not->broadcast_lut(active_streams);
+    lut_message_not->generate_and_broadcast_lut(
+        active_streams, {0}, {lut_message_lambda}, LUT_0_FOR_ALL_BLOCKS);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -259,13 +244,8 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t carry = x / this->params.message_modulus;
      return (~carry) % this->params.message_modulus;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0),
-        this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
-        this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        lut_carry_lambda, allocate_gpu_memory);
-    lut_carry_not->broadcast_lut(active_streams);
+    lut_carry_not->generate_and_broadcast_lut(
+        active_streams, {0}, {lut_carry_lambda}, LUT_0_FOR_ALL_BLOCKS);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -37,17 +37,12 @@ template <typename Torus> struct int_mul_memory {
      zero_out_predicate_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          zero_out_predicate_lut->get_lut(0, 0),
-          zero_out_predicate_lut->get_degree(0),
-          zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          zero_out_predicate_lut_f, gpu_memory_allocated);

      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      zero_out_predicate_lut->broadcast_lut(active_streams);
+      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {zero_out_predicate_lut_f},
+          LUT_0_FOR_ALL_BLOCKS);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -55,10 +50,7 @@ template <typename Torus> struct int_mul_memory {
      return;
    }

-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    // 'vector_result_lsb' contains blocks from all possible shifts of
    // radix_lwe_left excluding zero ciphertext blocks
@@ -70,6 +62,10 @@ template <typename Torus> struct int_mul_memory {

    int total_block_count = num_radix_blocks * num_radix_blocks;

+    GPU_ASSERT(lsb_vector_block_count + msb_vector_block_count ==
+                   total_block_count,
+               "MSB and LSB vector block counts don't match");
+
    // allocate memory for intermediate buffers
    vector_result_sb = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -91,8 +87,6 @@ template <typename Torus> struct int_mul_memory {
    // luts_array -> lut = {lsb_acc, msb_acc}
    luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
                                          allocate_gpu_memory, size_tracker);
-    auto lsb_acc = luts_array->get_lut(0, 0);
-    auto msb_acc = luts_array->get_lut(0, 1);

    // define functions for each accumulator
    auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -102,30 +96,21 @@ template <typename Torus> struct int_mul_memory {
      return (x * y) / message_modulus;
    };

-    // generate accumulators
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), lsb_acc,
-        luts_array->get_degree(0), luts_array->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        lut_f_lsb, gpu_memory_allocated);
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), msb_acc,
-        luts_array->get_degree(1), luts_array->get_max_degree(1),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        lut_f_msb, gpu_memory_allocated);
-
    // lut_indexes_vec for luts_array should be reinitialized
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
    // for message and carry default lut_indexes_vec is fine
-    if (allocate_gpu_memory)
-      cuda_set_value_async<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
-          msb_vector_block_count);
    auto active_streams =
        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    luts_array->broadcast_lut(active_streams);
+    // Blocks [0, lsb_vector_block_count) select LUT 0 (lsb); the rest LUT 1.
+    auto lut_index_generator = [lsb_vector_block_count](Torus *h_indexes,
+                                                        uint32_t count) {
+      for (uint32_t block = 0; block < count; block++)
+        h_indexes[block] = (block >= lsb_vector_block_count) ? 1 : 0;
+    };
+    luts_array->generate_and_broadcast_bivariate_lut(
+        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, lut_index_generator);
+
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, params, num_radix_blocks, 2 * num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
    uint32_t calculated_active_blocks =
        total_random_bits == 0
            ? 0
-            : (total_random_bits + message_bits_per_block - 1) /
-                  message_bits_per_block;
+            : CEIL_DIV(total_random_bits, message_bits_per_block);
    if (num_blocks_to_process != calculated_active_blocks) {
      PANIC(
          "num_blocks_to_process should be equal to calculated_active_blocks");
@@ -53,6 +52,10 @@ template <typename Torus> struct int_grouped_oprf_memory {

    // Pre-generate all possible LUTs.
    //
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
+    std::vector<uint64_t> lut_degrees;
+
    for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
         ++random_bit) {
      uint64_t p = 1ULL << random_bit;
@@ -70,23 +73,23 @@ template <typename Torus> struct int_grouped_oprf_memory {

      uint64_t degree = 0;
      uint32_t lut_index = random_bit - 1;
-      generate_device_accumulator_no_encoding<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts->get_lut(0, lut_index),
-          degree, params.message_modulus, params.carry_modulus,
-          params.glwe_dimension, params.polynomial_size, lut_f,
-          allocate_gpu_memory);
+
+      lut_funcs.push_back(lut_f);
+      lut_indices.push_back(lut_index);
+
      // In  OPRF the degree is hard set to p - 1 instead of the LUT degree
      degree = p - 1;
-      *luts->get_degree(lut_index) = degree;
+      lut_degrees.push_back(degree);
    }

    // For each block, this loop determines the exact number of bits to generate
    // (handling both bounded and unbounded cases), which pre-computed LUT to
    // use, and the final plaintext correction to add.
    //
-    Torus *h_corrections =
-        (Torus *)calloc(num_blocks_to_process * lwe_size, sizeof(Torus));
-    this->h_lut_indexes = (Torus *)calloc(num_blocks_to_process, sizeof(Torus));
+    Torus *h_corrections = (Torus *)calloc(
+        1, safe_mul_sizeof<Torus>(num_blocks_to_process, lwe_size));
+    this->h_lut_indexes =
+        (Torus *)calloc(1, safe_mul_sizeof<Torus>(num_blocks_to_process));

    uint64_t bits_processed = 0;
    for (uint32_t i = 0; i < num_blocks_to_process; ++i) {
@@ -102,10 +105,6 @@ template <typename Torus> struct int_grouped_oprf_memory {
      Torus plaintext_to_add = (p - 1) * delta / 2;

      h_corrections[i * lwe_size + params.big_lwe_dimension] = plaintext_to_add;
-      if (bits_for_this_block < 1) {
-        PANIC("bits_for_this_block should be greater than 1");
-      }
-      this->h_lut_indexes[i] = bits_for_this_block - 1;

      bits_processed += bits_for_this_block;
    }
@@ -117,18 +116,40 @@ template <typename Torus> struct int_grouped_oprf_memory {
    // Copy the prepared plaintext corrections to the GPU.
    cuda_memcpy_with_size_tracking_async_to_gpu(
        this->plaintext_corrections->ptr, h_corrections,
-        num_blocks_to_process * lwe_size * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
+        safe_mul_sizeof<Torus>(num_blocks_to_process, lwe_size),
+        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);

    // Copy the prepared LUT indexes to the GPU 0, before broadcast to all other
    // GPUs.
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        luts->get_lut_indexes(0, 0), this->h_lut_indexes,
-        num_blocks_to_process * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
-    luts->broadcast_lut(active_streams);
+    // These LUTs use no encoding. Generating them also resets the LUT degrees
+    // to default values, so the OPRF-specific degrees are restored below.
+    auto luts_index_generator = [total_random_bits, message_bits_per_block](
+                                    Torus *h_lut_indexes, uint32_t num_blocks) {
+      uint64_t bits_processed = 0; // bits already assigned to earlier blocks
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        if (total_random_bits <= bits_processed) {
+          PANIC("total_random_bits should be greater than bits_processed");
+        }
+        uint64_t bits_remaining = total_random_bits - bits_processed;
+        uint32_t bits_for_this_block =
+            std::min((uint64_t)message_bits_per_block, bits_remaining);
+        if (bits_for_this_block < 1) { // e.g. message_bits_per_block == 0
+          PANIC("bits_for_this_block should be at least 1");
+        }
+        h_lut_indexes[i] = bits_for_this_block - 1; // LUT k-1 serves k bits
+        bits_processed += bits_for_this_block;
+      }
+    };
+    luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
+                                     luts_index_generator, false, {},
+                                     this->h_lut_indexes);
+
+    // OPRF requires custom LUT degrees
+    for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
+      *luts->get_degree(i) = lut_degrees[i];
+    }

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_corrections);
@@ -170,8 +191,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->num_random_input_blocks =
-        (num_input_random_bits + message_bits_per_block - 1) /
-        message_bits_per_block;
+        CEIL_DIV(num_input_random_bits, message_bits_per_block);

    this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
        streams, params, this->num_random_input_blocks, message_bits_per_block,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/rerand.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/rerand.h
@@ -3,17 +3,16 @@
 #include "integer.h"

 extern "C" {
-uint64_t
-scratch_cuda_rerand_64(CudaStreamsFFI streams, int8_t **mem_ptr,
-                       uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
-                       uint32_t ks_level, uint32_t ks_base_log,
-                       uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-                       uint32_t carry_modulus, bool allocate_gpu_memory);
+uint64_t scratch_cuda_rerand_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory);

-void cuda_rerand_64(
+void cuda_rerand_64_async(
    CudaStreamsFFI streams, void *lwe_array,
    const void *lwe_flattened_encryptions_of_zero_compact_array_in,
    int8_t *mem_ptr, void *const *ksk);

-void cleanup_cuda_rerand(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_rerand_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
 }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h
@@ -1,5 +1,6 @@
 #pragma once

+#include "checked_arithmetic.h"
 #include "integer_utilities.h"
 #include "keyswitch/ks_enums.h"
 #include "zk/expand.cuh"
@@ -29,34 +30,34 @@ template <typename Torus> struct int_rerand_mem {
        gpu_memory_allocated(allocate_gpu_memory) {

    tmp_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_lwes * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        safe_mul_sizeof<Torus>(num_lwes, params.big_lwe_dimension + 1),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    tmp_ksed_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_lwes * (params.small_lwe_dimension + 1) * sizeof(Torus),
+        safe_mul_sizeof<Torus>(num_lwes, params.small_lwe_dimension + 1),
        streams.stream(0), streams.gpu_index(0), size_tracker,
        allocate_gpu_memory);

    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
-            num_lwes * sizeof(expand_job<Torus>), streams.stream(0),
+            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
            streams.gpu_index(0), size_tracker, allocate_gpu_memory));

    h_expand_jobs = static_cast<expand_job<Torus> *>(
-        malloc(num_lwes * sizeof(expand_job<Torus>)));
+        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));

    auto h_lwe_trivial_indexes =
-        static_cast<Torus *>(malloc(num_lwes * sizeof(Torus)));
+        static_cast<Torus *>(malloc(safe_mul_sizeof<Torus>(num_lwes)));
    for (auto i = 0; i < num_lwes; ++i) {
      h_lwe_trivial_indexes[i] = i;
    }
    lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_lwes * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-        size_tracker, allocate_gpu_memory);
+        safe_mul_sizeof<Torus>(num_lwes), streams.stream(0),
+        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_trivial_indexes,
-                             num_lwes * sizeof(Torus), streams.stream(0),
-                             streams.gpu_index(0));
+                             safe_mul_sizeof<Torus>(num_lwes),
+                             streams.stream(0), streams.gpu_index(0));

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

@@ -67,15 +68,19 @@ template <typename Torus> struct int_rerand_mem {
    cuda_drop_with_size_tracking_async(tmp_zero_lwes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+    tmp_zero_lwes = nullptr;
    cuda_drop_with_size_tracking_async(tmp_ksed_zero_lwes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+    tmp_ksed_zero_lwes = nullptr;
    cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+    lwe_trivial_indexes = nullptr;
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
+    d_expand_jobs = nullptr;

    for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
      cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
@@ -85,5 +90,6 @@ template <typename Torus> struct int_rerand_mem {

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_expand_jobs);
+    h_expand_jobs = nullptr;
  }
 };
--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -85,15 +85,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
-          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          shift_lut_f, gpu_memory_allocated);
+
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->broadcast_lut(active_streams);
+      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -172,16 +168,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
-          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->broadcast_lut(active_streams);
-
+      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -271,16 +261,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return shifted | padding;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          shift_last_block_lut_univariate->get_lut(0, 0),
-          shift_last_block_lut_univariate->get_degree(0),
-          shift_last_block_lut_univariate->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
      auto active_streams_shift_last =
          streams.active_gpu_subset(1, params.pbs_type);
-      shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
+      shift_last_block_lut_univariate->generate_and_broadcast_lut(
+          active_streams_shift_last, {0}, {last_block_lut_f},
+          LUT_0_FOR_ALL_BLOCKS);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -298,15 +283,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      return (params.message_modulus - 1) * x_sign_bit;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        padding_block_lut_univariate->get_lut(0, 0),
-        padding_block_lut_univariate->get_degree(0),
-        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        padding_block_lut_f, gpu_memory_allocated);
-    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-    padding_block_lut_univariate->broadcast_lut(active_streams);
+    padding_block_lut_univariate->generate_and_broadcast_lut(
+        active_streams, {0}, {padding_block_lut_f}, LUT_0_FOR_ALL_BLOCKS);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -339,16 +317,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return message_of_current_block + carry_of_previous_block;
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          shift_blocks_lut_bivariate->get_lut(0, 0),
-          shift_blocks_lut_bivariate->get_degree(0),
-          shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          blocks_lut_f, gpu_memory_allocated);
      auto active_streams_shift_blocks =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
+      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams_shift_blocks, {0}, {blocks_lut_f},
+          LUT_0_FOR_ALL_BLOCKS);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/Show More
+++ b/Show More