feat(gpu): implement shuffle

fix(gpu): semgrep step in pcc now fails on error
chore(gpu): optimize CI
2026-04-28 03:01:21 -04:00 · 2026-04-15 15:31:13 +02:00 · 2026-04-15 14:20:34 +02:00 · 2026-04-15 12:48:31 +02:00 · 2026-04-14 16:24:33 +02:00 · 2026-04-14 11:17:11 +02:00
202 changed files with 8079 additions and 3878 deletions
--- a/.cargo/audit.toml
+++ b/.cargo/audit.toml
@@ -4,6 +4,9 @@ ignore = [
    "RUSTSEC-2024-0436",
    # Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
    "RUSTSEC-2025-0141",
+    # Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
+    # arkworks and we do not use custom loggers.
+    "RUSTSEC-2026-0097",
 ]

 [output]
--- a/.github/workflows/aws_data_tests.yml
+++ b/.github/workflows/aws_data_tests.yml
@@ -54,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -16,7 +16,6 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
@@ -37,6 +36,7 @@ jobs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -79,6 +79,7 @@ jobs:
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+              - utils/tfhe-safe-serialize/**
            csprng:
              - tfhe-csprng/**
            zk_pok:
@@ -86,6 +87,8 @@ jobs:
            versionable:
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+            safe_serialize:
+              - utils/tfhe-safe-serialize/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -122,6 +125,7 @@ jobs:
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -145,7 +149,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -170,6 +174,11 @@ jobs:
        run: |
          make test_versionable

+      - name: Run tfhe-safe-serialize tests
+        if: needs.should-run.outputs.safe_serialize_test == 'true'
+        run: |
+          make test_safe_serialize
+
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -191,7 +200,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -204,7 +213,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -34,7 +34,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -14,12 +14,11 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]

 permissions:
  contents: read
@@ -32,16 +31,16 @@ jobs:
    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read  # Needed to check for file change
+      pull-requests: read # Needed to check for file change
    outputs:
      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
-          steps.changed-files.outputs.wasm_any_changed }}
+        steps.changed-files.outputs.wasm_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -63,6 +62,7 @@ jobs:
                - tfhe/js_on_wasm_tests/**
                - tfhe/web_wasm_parallel_tests/**
                - utils/tfhe-versionable/**
+                - utils/tfhe-safe-serialize/**
                - .github/workflows/aws_tfhe_wasm_tests.yml

  wasm-tests:
@@ -78,7 +78,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -92,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -105,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/backward_compat_pr_change_report.yml
+++ b/.github/workflows/backward_compat_pr_change_report.yml
@@ -6,6 +6,9 @@ name: backward_compat_pr_change_report
 on:
  pull_request:

+env:
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+
 permissions:
  contents: read

@@ -14,9 +17,35 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  should-run:
+    name: backward_compat_pr_change_report/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            backward:
+              - utils/tfhe-lints/snapshots/*.json
+
  change-report:
    name: backward_compat_pr_change_report/change-report (bpr)
    runs-on: ubuntu-latest
+    needs: should-run
+    if:
+      needs.should-run.outputs.backward_report == 'true'
    permissions:
      pull-requests: write # To send and modify message in the PR
    steps:
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -19,7 +19,7 @@ on:
          - shortint_oprf
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
          - hlapi_kvstore
@@ -93,8 +93,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -108,14 +108,14 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-hlapi-erc20:
-    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
+  run-benchmarks-hlapi-erc7984:
+    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
-      command: hlapi_erc20
-      additional_file_to_parse: erc20_pbs_count.csv
+      command: hlapi_erc7984
+      additional_file_to_parse: erc7984_pbs_count.csv
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -17,6 +17,10 @@ on:
        description: "Run GPU core-crypto benchmarks"
        type: boolean
        default: true
+      run-gpu-zk-benchmarks:
+        description: "Run GPU ZK benchmarks"
+        type: boolean
+        default: true
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
@@ -36,7 +40,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -91,7 +95,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc20
+      command: integer_multi_bit,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -110,7 +114,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -165,21 +169,42 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

+  run-benchmarks-gpu-zk-server:
+    name: benchmark_documentation/run-benchmarks-gpu-zk-server
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-zk-benchmarks
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: integer_zk
+      op_flavor: default
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
  generate-svgs-with-benchmarks-run:
    name: benchmark-documentation/generate-svgs-with-benchmarks-run
    if: ${{ always() &&
-      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
+      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
-      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
+      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
+      run-benchmarks-gpu-zk-server
    ]
    uses: ./.github/workflows/generate_svgs.yml
    with:
      time_span_days: 5
      generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
-      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
+      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
      generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -188,7 +213,7 @@ jobs:

  generate-svgs-without-benchmarks-run:
    name: benchmark-documentation/generate-svgs-without-benchmarks-run
-    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
+    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    uses: ./.github/workflows/generate_svgs.yml
    with:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -37,7 +37,7 @@ on:
          - integer_zk_experimental
          - integer_aes
          - integer_aes256
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
      op_flavor:
@@ -123,8 +123,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -126,17 +126,11 @@ jobs:
    needs: prepare-matrix
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,25 +139,6 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

-      - name: Acknowledge remote instance failure
-        if: steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile != 'single-h100'
-        run: |
-          echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
-          echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
-          exit 1
-        env:
-          INPUTS_PROFILE: ${{ inputs.profile }}
-
-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' &&
-          steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile == 'single-h100'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  # Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
  install-dependencies:
    name: benchmark_gpu_common/install-dependencies
@@ -184,7 +159,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -333,13 +307,13 @@ jobs:

  teardown-instance:
    name: benchmark_gpu_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -42,7 +42,7 @@ env:
  OPTIMIZATION_TARGET: "throughput"
  BATCH_SIZE: "5000"
  SCHEDULING_POLICY: "MAX_PARALLELISM"
-  BENCHMARKS: "erc20"
+  BENCHMARKS: "erc7984"
  BRANCH_NAME: ${{ github.ref_name }}
  COMMIT_SHA: ${{ github.sha }}
  SLAB_SECRET: ${{ secrets.JOB_SECRET }}
@@ -94,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: |
            ~/.cargo/registry
@@ -214,14 +214,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -248,13 +248,13 @@ jobs:
          npm install && npm run deploy:emptyProxies && npx hardhat compile
        working-directory: fhevm/

-      - name: Profile erc20 no-cmux benchmark on GPU
+      - name: Profile erc7984 no-cmux benchmark on GPU
        run: |
          BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
          FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
          BENCHMARK_TYPE="THROUGHPUT_200" \
          OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
-          make -e "profile_erc20_gpu"
+          make -e "profile_erc7984_gpu"
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Get nsys profile name
@@ -333,7 +333,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -14,7 +14,7 @@ on:
          - integer
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -387,7 +387,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -77,7 +77,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -137,7 +137,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -218,7 +218,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -138,7 +138,7 @@ jobs:
      - name: Node cache restoration
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -151,7 +151,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -50,7 +50,7 @@ jobs:
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -74,7 +74,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -88,7 +88,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -209,60 +209,98 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

+  gpu-zk-server-latency-table:
+    name: generate_documentation_svgs/gpu-zk-server-latency-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: latency
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-latency
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  gpu-zk-server-throughput-table:
+    name: generate_documentation_svgs/gpu-zk-server-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: throughput
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-throughput
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
  # -----------------------------------------------------------
-  # ERC20 benchmarks tables
+  # ERC7984 benchmarks tables
  # -----------------------------------------------------------

-  cpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
+  cpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
      backend: cpu
      hardware_name: hpc7a.96xlarge
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
+      output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  gpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
+  gpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-gpu-svgs
    with:
      backend: gpu
      hardware_name: n3-H100-SXM5x8
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: multi_bit
      grouping_factor: 4
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
+      output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  hpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
+  hpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-hpu-svgs
    with:
      backend: hpu
      hardware_name: hpu_x1
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
+      output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -62,29 +63,24 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,13 +89,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -132,7 +121,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_core_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -77,7 +77,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,17 +25,11 @@ jobs:
    name: gpu_full_h100_tests/setup-instance
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,13 +38,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: gpu_full_h100_tests/cuda-tests-linux
    needs: [ setup-instance ]
@@ -74,7 +61,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -118,13 +104,13 @@ jobs:

  teardown-instance:
    name: gpu_full_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +186,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -65,27 +66,23 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -133,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -184,14 +173,14 @@ jobs:

  teardown-instance:
    name: gpu_hlapi_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -17,8 +17,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Nightly tests will be triggered each evening 8p.m.
-    - cron: "0 20 * * *"
+    # Weekly tests will be triggered every Monday at 8 p.m. UTC.
+    - cron: "0 20 * * 1"
  pull_request:


@@ -28,17 +28,48 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  should-run:
+    name: gpu_integer_long_run_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - '.github/workflows/gpu_integer_long_run_tests.yml'
+
  setup-instance:
    name: gpu_integer_long_run_tests/setup-instance
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    needs: [should-run]  # supplies is_needed_in_gpu_ci (file-change filter), honoured for pull requests only
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && needs.should-run.outputs.is_needed_in_gpu_ci == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +143,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -131,6 +131,10 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

+      - name: Run semgrep and lint checks on CUDA code
+        run: |
+          make semgrep_and_lint_gpu_code
+
      - name: Run fmt checks
        run: |
          make check_fmt_gpu
@@ -139,10 +143,6 @@ jobs:
        run: |
          make pcc_gpu

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          make semgrep_and_lint_gpu_code
-
      - name: Run semver checks on tfhe-cuda-backend
        run: |
          make semver_check_cuda_backend
@@ -176,7 +176,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,30 +64,25 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_signed_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +177,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,30 +64,25 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_unsigned_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +177,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -55,12 +55,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/zk/**
              - tfhe-zk-pok/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

@@ -76,7 +73,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +167,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -107,7 +107,7 @@ jobs:
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
--- a/.github/workflows/make_release_common_cuda.yml
+++ b/.github/workflows/make_release_common_cuda.yml
@@ -1,12 +1,36 @@
-name: make_release_cuda
+# Common workflow to make crate release for CUDA backend
+name: make_release_common_cuda

 on:
-  workflow_dispatch:
+  workflow_call:
    inputs:
-      dry_run:
-        description: "Dry-run"
+      package-name:
+        type: string
+        required: true
+      dry-run:
        type: boolean
        default: true
+    secrets:
+      REPO_CHECKOUT_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true
+      ALLOWED_TEAM:
+        required: true
+      READ_ORG_TOKEN:
+        required: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -21,15 +45,15 @@ permissions: {}

 jobs:
  verify-triggering-actor:
-    name: make_release_cuda/verify-triggering-actor
+    name: make_release_common_cuda/verify-triggering-actor
    if: startsWith(github.ref, 'refs/tags/')
    uses: ./.github/workflows/verify_triggering_actor.yml
    secrets:
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  setup-instance:
-    name: make_release_cuda/setup-instance
+    name: make_release_common_cuda/setup-instance
    needs: verify-triggering-actor
    runs-on: ubuntu-latest
    outputs:
@@ -37,7 +61,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,7 +71,7 @@ jobs:
          profile: gpu-build

  package:
-    name: make_release_cuda/package
+    name: make_release_common_cuda/package
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    outputs:
@@ -76,7 +100,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -89,7 +112,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -101,12 +123,14 @@ jobs:
          GCC_VERSION: ${{ matrix.gcc }}

      - name: Prepare package
+        env:
+          PACKAGE: ${{ inputs.package-name }}
        run: |
-          cargo package -p tfhe-cuda-backend
+          cargo package -p "${PACKAGE}"

      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate

      - name: generate hash
@@ -114,8 +138,8 @@ jobs:
        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

  provenance:
-    name: make_release_cuda/provenance
-    if: ${{ !inputs.dry_run  }}
+    name: make_release_common_cuda/provenance
+    if: ${{ !inputs.dry-run  }}
    needs: [package]
    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
@@ -128,7 +152,7 @@ jobs:
      base64-subjects: ${{ needs.package.outputs.hash }}

  publish-cuda-release:
-    name: make_release_cuda/publish-cuda-release
+    name: make_release_common_cuda/publish-cuda-release
    needs: [setup-instance, package] # for comparing hashes
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    permissions:
@@ -150,7 +174,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -163,7 +186,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -177,22 +199,23 @@ jobs:
      - name: Download artifact
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
        env:
          CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+          PACKAGE: ${{ inputs.package-name }}
+          DRY_RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
+          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
+          # would fail. This is safe since DRY_RUN is handled in the env section above (keep the underscore: a hyphenated name is not expandable in bash).
          # shellcheck disable=SC2086
-          cargo publish -p tfhe-cuda-backend ${DRY_RUN}
+          cargo publish -p "${PACKAGE}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -204,7 +227,7 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -212,17 +235,17 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: make_release_cuda/teardown-instance
+    name: make_release_common_cuda/teardown-instance
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, publish-cuda-release]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -235,4 +258,4 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -16,6 +16,10 @@ on:
        description: "Push web js package"
        type: boolean
        default: true
+      push_web_compat_package:
+        description: "Push web compat (cross-origin) js package"
+        type: boolean
+        default: true
      push_node_package:
        description: "Push node js package"
        type: boolean
@@ -99,6 +103,23 @@ jobs:
          tag: ${{ env.NPM_TAG }}
          provenance: true

+      - name: Build web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        run: |
+          rm -rf tfhe/pkg
+
+          make build_web_js_api
+          sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
+
+      - name: Publish web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
+        with:
+          package: tfhe/pkg/package.json
+          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}
+          provenance: true
+
      - name: Build Node package
        if: ${{ inputs.push_node_package }}
        run: |
--- a/.github/workflows/make_release_tfhe_cuda.yml
+++ b/.github/workflows/make_release_tfhe_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of tfhe-rs CUDA backend on crates.io.
+name: make_release_tfhe_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "tfhe-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/make_release_tfhe_safe_serialize.yml
+++ b/.github/workflows/make_release_tfhe_safe_serialize.yml
@@ -0,0 +1,32 @@
+name: make_release_tfhe_safe_serialize
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_safe_serialize/make-release
+    uses: ./.github/workflows/make_release_common.yml
+    with:
+      package-name: "tfhe-safe-serialize"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
--- a/.github/workflows/make_release_zk_cuda.yml
+++ b/.github/workflows/make_release_zk_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
+name: make_release_zk_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_zk_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "zk-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Restore Sagemath image from cache
        id: docker-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
      - name: Store Sagemath image in cache
        if: steps.docker-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,7 @@ members = [
    "utils/tfhe-backward-compat-checker",
    "utils/tfhe-backward-compat-data",
    "utils/tfhe-backward-compat-data/crates/add_new_version",
+    "utils/tfhe-safe-serialize",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/wasm-par-mq",
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/Makefile
+++ b/Makefile
@@ -312,7 +312,7 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
 	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
 		| grep -v '/cmake-build-debug/' \
 		| grep -v '/build/' \
-		| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
+		| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
 	venv/bin/python3 "scripts/check_scratch_cleanup.py"

 .PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
@@ -557,6 +557,11 @@ clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-versionable -- --no-deps -D warnings

+.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
+clippy_safe_serialize: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-safe-serialize -- --no-deps -D warnings
+
 .PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
 clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -592,7 +597,7 @@ clippy_test_vectors: install_rs_check_toolchain
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
+clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
 clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq

 .PHONY: clippy_fast # Run main clippy targets
@@ -1270,6 +1275,11 @@ test_versionable:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		--all-targets -p tfhe-versionable

+.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
+test_safe_serialize:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--all-targets -p tfhe-safe-serialize
+
 # The backward compat data folder holds historical binary data but also rust code to generate and load them.
 .PHONY: gen_backward_compat_data # Re-generate backward compatibility data
 gen_backward_compat_data:
@@ -1924,25 +1934,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
 	--bench hlapi \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
-bench_hlapi_erc20: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
+bench_hlapi_erc7984: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
-bench_hlapi_erc20_gpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
+bench_hlapi_erc7984_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
-bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
+bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
@@ -1966,13 +1976,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
-bench_hlapi_erc20_hpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ERC7984 operations on HPU
+bench_hlapi_erc7984_hpu: install_rs_check_toolchain
 	source ./setup_hpu.sh --config $(HPU_CONFIG); \
 	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
@@ -2028,10 +2038,10 @@ bench_summary: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'

 	# DEX
@@ -2073,10 +2083,10 @@ bench_summary_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'

 	# DEX
@@ -2266,6 +2276,7 @@ pcc_batch_6:
 	$(call run_recipe_with_details,clippy_zk_pok_wasm)
 	$(call run_recipe_with_details,clippy_trivium)
 	$(call run_recipe_with_details,clippy_versionable)
+	$(call run_recipe_with_details,clippy_safe_serialize)
 	$(call run_recipe_with_details,clippy_param_dedup)
 	$(call run_recipe_with_details,docs)

--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -36,5 +36,19 @@ void cuda_glwe_sample_extract_128_async(
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
    uint32_t glwe_dimension, uint32_t polynomial_size);
+
+void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
+                                            void *lwe_array_out,
+                                            void *lwe_array_in, uint32_t size,
+                                            uint32_t log_modulus,
+                                            uint32_t degree,
+                                            uint32_t grouping_factor);
+
+void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
+                                             void *lwe_array_out,
+                                             void *lwe_array_in, uint32_t size,
+                                             uint32_t log_modulus,
+                                             uint32_t degree,
+                                             uint32_t grouping_factor);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -1039,6 +1039,24 @@ void cuda_cast_to_signed_64_async(CudaStreamsFFI streams,

 void cleanup_cuda_cast_to_signed_64(CudaStreamsFFI streams,
                                    int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_integer_bitonic_sort_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t num_values, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool is_signed,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+
+void cuda_integer_bitonic_sort_64_async(CudaStreamsFFI streams,
+                                        CudaRadixCiphertextFFI **values,
+                                        uint32_t num_values, int8_t *mem_ptr,
+                                        void *const *bsks, void *const *ksks,
+                                        int32_t direction);
+
+void cleanup_cuda_integer_bitonic_sort_64(CudaStreamsFFI streams,
+                                          int8_t **mem_ptr_void);
 } // extern C

 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shuffle_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shuffle_utilities.h
@@ -0,0 +1,217 @@
+#pragma once
+#include "checked_arithmetic.h"
+#include "comparison.h"
+#include "integer_utilities.h"
+
+template <typename Torus> struct int_bitonic_sort_buffer {
+  int_radix_params params;
+  uint32_t max_num_pairs;
+  uint32_t num_radix_blocks;
+
+  int_comparison_buffer<Torus> *comparison_mem;
+  CudaRadixCiphertextFFI *comparison_results;
+
+  // Unsigned compare path: K*N packed, K*N/2 diffs, K*N/2 tree scratch (x, y).
+  CudaRadixCiphertextFFI *batch_cmp_packed;
+  CudaRadixCiphertextFFI *batch_cmp_comparisons;
+  CudaRadixCiphertextFFI *batch_cmp_tree_x;
+  CudaRadixCiphertextFFI *batch_cmp_tree_y;
+
+  int_radix_lut<Torus> *batch_identity_lut;
+  int_radix_lut<Torus> *batch_is_non_zero_lut;
+  int_radix_lut<Torus> *batch_inner_tree_leaf_lut;
+  int_radix_lut<Torus> *batch_last_tree_leaf_lut;
+  Torus *preallocated_h_lut;
+
+  // Batched cmux: 4KN = [true-side (2KN)] [false-side (2KN)]; condition
+  // broadcast per block.
+  CudaRadixCiphertextFFI *batch_buffer_in;
+  CudaRadixCiphertextFFI *batch_buffer_out;
+  CudaRadixCiphertextFFI *batch_condition;
+
+  int_radix_lut<Torus> *batch_predicate_lut;
+  int_radix_lut<Torus> *batch_message_extract_lut;
+
+  bool is_signed;
+  bool gpu_memory_allocated;
+
+  int_bitonic_sort_buffer(CudaStreams streams, int_radix_params params,
+                          uint32_t num_radix_blocks, uint32_t num_values,
+                          bool is_signed, bool allocate_gpu_memory,
+                          uint64_t &size_tracker) {
+    this->params = params;
+    this->is_signed = is_signed;
+    this->gpu_memory_allocated = allocate_gpu_memory;
+    this->num_radix_blocks = num_radix_blocks;
+    this->max_num_pairs = num_values / 2;
+
+    uint32_t K = max_num_pairs;
+    uint32_t N = num_radix_blocks;
+    uint32_t packed_per_pair = N / 2;
+    uint32_t total_bivariate_blocks = 4 * K * N;
+    uint32_t total_result_blocks = 2 * K * N;
+
+    comparison_mem = new int_comparison_buffer<Torus>(
+        streams, COMPARISON_TYPE::GT, params, num_radix_blocks, is_signed,
+        allocate_gpu_memory, size_tracker);
+
+    comparison_results = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), comparison_results, K,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    batch_cmp_packed = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_cmp_packed, K * N,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    batch_cmp_comparisons = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_cmp_comparisons,
+        K * packed_per_pair, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    batch_cmp_tree_x = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_cmp_tree_x,
+        K * packed_per_pair, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    batch_cmp_tree_y = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_cmp_tree_y,
+        K * packed_per_pair, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    Torus total_modulus = params.message_modulus * params.carry_modulus;
+
+    batch_identity_lut = new int_radix_lut<Torus>(
+        streams, params, 1, K * N, allocate_gpu_memory, size_tracker);
+    auto active_id = streams.active_gpu_subset(K * N, params.pbs_type);
+    batch_identity_lut->generate_and_broadcast_lut(
+        active_id, {0}, {[](Torus x) -> Torus { return x; }},
+        LUT_0_FOR_ALL_BLOCKS);
+
+    batch_is_non_zero_lut =
+        new int_radix_lut<Torus>(streams, params, 1, K * packed_per_pair,
+                                 allocate_gpu_memory, size_tracker);
+    auto active_nz =
+        streams.active_gpu_subset(K * packed_per_pair, params.pbs_type);
+    batch_is_non_zero_lut->generate_and_broadcast_lut(
+        active_nz, {0}, {[total_modulus](Torus x) -> Torus {
+          return (x % total_modulus) != 0;
+        }},
+        LUT_0_FOR_ALL_BLOCKS);
+
+    batch_inner_tree_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, K * packed_per_pair,
+                                 allocate_gpu_memory, size_tracker);
+    std::function<Torus(Torus, Torus)> block_selector_f =
+        [](Torus msb, Torus lsb) -> Torus {
+      return (msb == IS_EQUAL) ? lsb : msb;
+    };
+    batch_inner_tree_leaf_lut->generate_and_broadcast_bivariate_lut(
+        active_nz, {0}, {block_selector_f}, LUT_0_FOR_ALL_BLOCKS);
+
+    batch_last_tree_leaf_lut = new int_radix_lut<Torus>(
+        streams, params, 1, K, allocate_gpu_memory, size_tracker);
+
+    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
+        params.glwe_dimension + 1, params.polynomial_size));
+
+    batch_buffer_in = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_buffer_in,
+        total_bivariate_blocks, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    batch_buffer_out = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_buffer_out,
+        total_bivariate_blocks, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    batch_condition = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), batch_condition,
+        total_bivariate_blocks, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    // Cmux: LUT 0 (true-side half) yields b if cond == IS_SUPERIOR, else 0;
+    // LUT 1 (false-side half) the opposite. Halves are summed after the PBS.
+    batch_predicate_lut =
+        new int_radix_lut<Torus>(streams, params, 2, total_bivariate_blocks,
+                                 allocate_gpu_memory, size_tracker);
+
+    auto pred_f = [](Torus x) -> Torus { return x == IS_SUPERIOR; };
+    auto active_pred =
+        streams.active_gpu_subset(total_bivariate_blocks, params.pbs_type);
+    batch_predicate_lut->generate_and_broadcast_bivariate_lut(
+        active_pred, {0, 1},
+        {[pred_f](Torus b, Torus c) -> Torus { return pred_f(c) ? b : 0; },
+         [pred_f](Torus b, Torus c) -> Torus { return pred_f(c) ? 0 : b; }},
+        [total_result_blocks](Torus *idx, uint32_t) {
+          for (uint32_t i = 0; i < 2 * total_result_blocks; i++)
+            idx[i] = (i < total_result_blocks) ? 0 : 1;
+        });
+
+    batch_message_extract_lut =
+        new int_radix_lut<Torus>(streams, params, 1, total_result_blocks,
+                                 allocate_gpu_memory, size_tracker);
+    auto active_msg =
+        streams.active_gpu_subset(total_result_blocks, params.pbs_type);
+    batch_message_extract_lut->generate_and_broadcast_lut(
+        active_msg, {0},
+        {[params](Torus x) -> Torus { return x % params.message_modulus; }},
+        LUT_0_FOR_ALL_BLOCKS);
+  }
+
+  void release(CudaStreams streams) {
+    comparison_mem->release(streams);
+    delete comparison_mem;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   comparison_results, gpu_memory_allocated);
+    delete comparison_results;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_cmp_packed, gpu_memory_allocated);
+    delete batch_cmp_packed;
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_cmp_comparisons, gpu_memory_allocated);
+    delete batch_cmp_comparisons;
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_cmp_tree_x, gpu_memory_allocated);
+    delete batch_cmp_tree_x;
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_cmp_tree_y, gpu_memory_allocated);
+    delete batch_cmp_tree_y;
+
+    batch_identity_lut->release(streams);
+    delete batch_identity_lut;
+    batch_is_non_zero_lut->release(streams);
+    delete batch_is_non_zero_lut;
+    batch_inner_tree_leaf_lut->release(streams);
+    delete batch_inner_tree_leaf_lut;
+    batch_last_tree_leaf_lut->release(streams);
+    delete batch_last_tree_leaf_lut;
+    free(preallocated_h_lut);
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_buffer_in, gpu_memory_allocated);
+    delete batch_buffer_in;
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_buffer_out, gpu_memory_allocated);
+    delete batch_buffer_out;
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   batch_condition, gpu_memory_allocated);
+    delete batch_condition;
+
+    batch_predicate_lut->release(streams);
+    delete batch_predicate_lut;
+    batch_message_extract_lut->release(streams);
+    delete batch_message_extract_lut;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -39,6 +39,28 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
                                                      uint32_t gpu_index,
                                                      int8_t **pbs_buffer);

+// Noise-tests-namespaced wrappers for scratch/cleanup, so that callers
+// working with the noise-tests PBS variant use a consistent naming scheme.
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
+
+// Noise tests variant: 64-bit torus, polynomial_size=2048 only. Uses the
+// NOISE_TESTS keybundle mode for noise analysis purposes.
+void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride);
+
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
@@ -56,6 +78,23 @@ void cuda_multi_bit_programmable_bootstrap_128_async(
 void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer);
+
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
+
+void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lwe_array_in, void const *lwe_input_indexes,
+    void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride);
 }

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
  uint32_t num_lwes;
  uint32_t num_compact_lists;

-  int_radix_lut<Torus> *message_and_carry_extract_luts;
-  int_radix_lut<Torus> *identity_lut;
+  int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
+  int_radix_lut<Torus> *identity_lut = nullptr;

-  Torus *tmp_expanded_lwes;
-  Torus *tmp_ksed_small_to_big_expanded_lwes;
+  Torus *tmp_expanded_lwes = nullptr;
+  Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;

  bool gpu_memory_allocated;

@@ -148,66 +148,6 @@ template <typename Torus> struct zk_expand_mem {
      PANIC("GPU backend requires carry_modulus equal to message_modulus")
    }

-    // We create the identity LUT only if we are doing a SANITY_CHECK
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut =
-          new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
-                                   allocate_gpu_memory, size_tracker);
-
-      auto identity_lut_f = [](Torus x) -> Torus { return x; };
-
-      identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
-                                               LUT_0_FOR_ALL_BLOCKS);
-    }
-
-    auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
-      return x % casting_params.message_modulus;
-    };
-    auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
-      return (x / casting_params.carry_modulus) %
-             casting_params.message_modulus;
-    };
-
-    // Booleans have to be sanitized
-    auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
-    auto message_extract_and_sanitize_bool_lut_f =
-        [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-      return sanitize_bool_f(message_extract_lut_f(x));
-    };
-    auto carry_extract_and_sanitize_bool_lut_f =
-        [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-      return sanitize_bool_f(carry_extract_lut_f(x));
-    };
-
-    /** In case the casting key casts from BIG to SMALL key we run a single KS
-    to expand using the casting key as ksk. Otherwise, in case the casting key
-    casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
-    the casting key as ksk, then we keyswitch from BIG to SMALL using the
-    computing ksk, and lastly we apply the PBS. The output is always on the
-    BIG key.
-    **/
-    auto params = casting_params;
-    if (casting_key_type == SMALL_TO_BIG) {
-      params = computing_params;
-    }
-    message_and_carry_extract_luts = new int_radix_lut<Torus>(
-        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
-
-    // We are always packing two LWEs. We just need to be sure we have enough
-    // space in the carry part to store a message of the same size as is in the
-    // message part.
-    if (params.carry_modulus < params.message_modulus)
-      PANIC("Carry modulus must be at least as large as message modulus");
-    auto num_packed_msgs = 2;
-
-    // Adjust indexes to permute the output and access the correct LUT
-    auto h_indexes_in = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-    auto h_indexes_out = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-    auto h_lut_indexes = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-
    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
@@ -216,144 +156,202 @@ template <typename Torus> struct zk_expand_mem {
    h_expand_jobs = static_cast<expand_job<Torus> *>(
        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));

-    /*
-     * Each LWE contains encrypted data in both carry and message spaces
-     * that needs to be extracted.
-     *
-     * The loop processes each compact list (k) and for each LWE within that
-     * list:
-     * 1. Sets input indexes to read each LWE twice (for carry and message
-     * extraction)
-     * 2. Creates output indexes to properly reorder the results
-     * 3. Selects appropriate LUT index based on whether boolean sanitization is
-     * needed
-     *
-     * We want the output to have always first the content of the message part
-     * and then the content of the carry part of each LWE.
-     *
-     * i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
-     * carry_extract(LWE_1), ...
-     *
-     * Aiming that behavior, with 4 LWEs we would have:
-     *
-     * // Each LWE is processed twice
-     * h_indexes_in   = {0, 1, 2, 3, 0, 1, 2, 3}
-     *
-     * // First 4 use message LUT, last 4 use carry LUT
-     * h_lut_indexes  = {0, 0, 0, 0, 1, 1, 1, 1}
-     *
-     * // Reorders output so message and carry for each LWE appear together
-     * h_indexes_out  = {0, 2, 4, 6, 1, 3, 5, 7}
-     *
-     * If an LWE contains a boolean value, its LUT index is shifted by
-     * num_packed_msgs to use the sanitization LUT (which ensures output is
-     * exactly 0 or 1).
-     */
-    auto offset = 0;
-    for (int k = 0; k < num_compact_lists; k++) {
-      auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
-      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
-        auto lwe_index = i + num_packed_msgs * offset;
-        auto lwe_index_in_list = i % num_lwes_in_kth;
-        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
-                       "Cuda error: index %d is beyond the max value %d",
-                       lwe_index, num_packed_msgs * num_lwes);
-        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
-        h_indexes_out[lwe_index] =
-            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %lu is beyond the max value %lu",
-                       (unsigned long)h_indexes_in[lwe_index],
-                       (unsigned long)(num_packed_msgs * num_lwes));
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %lu is beyond the max value %lu",
-                       (unsigned long)h_indexes_out[lwe_index],
-                       (unsigned long)(num_packed_msgs * num_lwes));
-        // is_boolean_array tells us which input is a boolean and thus the
-        // related output needs boolean sanitization. It naturally has
-        // total_blocks entries, but h_indexes_out reaches
-        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
-        // the ceiling causes out-of-bounds access. Reading garbage "true" would
-        // set h_lut_indexes to an invalid index pointing to uninitialized
-        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
-        // to match.
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
-                       "Cuda error: index %lu for is_boolean_array is out of "
-                       "bounds (len is %lu)",
-                       (unsigned long)h_indexes_out[lwe_index],
-                       (unsigned long)is_boolean_array_len);
+    // NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
+    // no intermediate buffers needed.
+    if (expand_kind != EXPAND_KIND::NO_CASTING) {
+      /** In case the casting key casts from BIG to SMALL key we run a single KS
+      to expand using the casting key as ksk. Otherwise, in case the casting key
+      casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
+      the casting key as ksk, then we keyswitch from BIG to SMALL using the
+      computing ksk, and lastly we apply the PBS. The output is always on the
+      BIG key.
+      **/
+      auto params = casting_params;
+      if (casting_key_type == SMALL_TO_BIG) {
+        params = computing_params;
      }
-      offset += num_lwes_in_kth;
-    }

-    message_and_carry_extract_luts->set_lwe_indexes(
-        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
+      // We always pack two LWEs (message and carry parts per LWE)
+      auto num_packed_msgs = 2;

-    auto active_streams =
-        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
+      // Adjust indexes to permute the output and access the correct LUT.
+      //
+      // The loop below fills h_indexes_in and h_indexes_out so that the output
+      // is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
+      // msg_extract(LWE_1), carry_extract(LWE_1), ...
+      //
+      // With 4 LWEs the arrays look like:
+      //   h_indexes_in  = {0, 1, 2, 3, 0, 1, 2, 3}  (each LWE read twice)
+      //   h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}  (msg LUT then carry LUT)
+      //   h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}  (interleaved output)
+      //
+      // If an LWE contains a boolean its LUT index is shifted by
+      // num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
+      auto h_indexes_in = static_cast<Torus *>(
+          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+      auto h_indexes_out = static_cast<Torus *>(
+          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));

-    // Index generator for message/carry extraction LUTs
-    auto index_gen = [num_compact_lists,
-                      num_lwes_per_compact_list =
-                          this->num_lwes_per_compact_list,
-                      num_packed_msgs, is_boolean_array,
-                      h_indexes_out](Torus *h_lut_indexes, uint32_t) {
      auto offset = 0;
      for (int k = 0; k < num_compact_lists; k++) {
-        auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+        auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
        for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
          auto lwe_index = i + num_packed_msgs * offset;
-          auto boolean_offset =
-              is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
-          h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+          auto lwe_index_in_list = i % num_lwes_in_kth;
+          PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                         "Cuda error: index %d is beyond the max value %d",
+                         lwe_index, num_packed_msgs * num_lwes);
+          h_indexes_in[lwe_index] = lwe_index_in_list + offset;
+          h_indexes_out[lwe_index] =
+              num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
+          PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                         "Cuda error: index %lu is beyond the max value %lu",
+                         (unsigned long)h_indexes_in[lwe_index],
+                         (unsigned long)(num_packed_msgs * num_lwes));
+          PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                         "Cuda error: index %lu is beyond the max value %lu",
+                         (unsigned long)h_indexes_out[lwe_index],
+                         (unsigned long)(num_packed_msgs * num_lwes));
+          // is_boolean_array tells us which input is a boolean and thus the
+          // related output needs boolean sanitization. It naturally has
+          // total_blocks entries, but h_indexes_out reaches
+          // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is
+          // odd, the ceiling causes out-of-bounds access. Reading garbage
+          // "true" would set h_lut_indexes to an invalid index pointing to
+          // uninitialized memory instead of a real LUT. Rust pads
+          // is_boolean_array with FALSE to match.
+          PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                         "Cuda error: index %lu for is_boolean_array is out of "
+                         "bounds (len is %lu)",
+                         (unsigned long)h_indexes_out[lwe_index],
+                         (unsigned long)is_boolean_array_len);
        }
        offset += num_lwes_in_kth;
      }
-    };

-    message_and_carry_extract_luts->generate_and_broadcast_lut(
-        active_streams, {0, 1, 2, 3},
-        {message_extract_lut_f, carry_extract_lut_f,
-         message_extract_and_sanitize_bool_lut_f,
-         carry_extract_and_sanitize_bool_lut_f},
-        index_gen, true, {}, h_lut_indexes);
+      auto active_streams =
+          streams.active_gpu_subset(2 * num_lwes, params.pbs_type);

-    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
-        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
-    // The expanded LWEs will always be on the casting key format
-    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
-        streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
+      // SANITY_CHECK uses identity_lut (skipping the full message/carry
+      // extraction LUT and the SMALL_TO_BIG intermediate buffer).
+      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        identity_lut =
+            new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
+                                     allocate_gpu_memory, size_tracker);

-    tmp_ksed_small_to_big_expanded_lwes =
-        (Torus *)cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<Torus>(num_lwes,
-                                   casting_params.big_lwe_dimension + 1),
-            streams.stream(0), streams.gpu_index(0), size_tracker,
-            allocate_gpu_memory);
+        auto identity_lut_f = [](Torus x) -> Torus { return x; };
+        identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
+                                                 LUT_0_FOR_ALL_BLOCKS);
+        identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
+                                      h_indexes_in, h_indexes_out);
+        identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
+            active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
+      } else {
+        // We are always packing two LWEs. We just need to be sure we have
+        // enough space in the carry part to store a message of the same size
+        // as is in the message part.
+        if (params.carry_modulus < params.message_modulus)
+          PANIC("Carry modulus must be at least as large as message modulus");
+
+        message_and_carry_extract_luts =
+            new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
+                                     allocate_gpu_memory, size_tracker);
+        message_and_carry_extract_luts->set_lwe_indexes(
+            streams.stream(0), streams.gpu_index(0), h_indexes_in,
+            h_indexes_out);
+
+        auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
+          return x % casting_params.message_modulus;
+        };
+        auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
+          return (x / casting_params.carry_modulus) %
+                 casting_params.message_modulus;
+        };
+        auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
+        auto message_extract_and_sanitize_bool_lut_f =
+            [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+          return sanitize_bool_f(message_extract_lut_f(x));
+        };
+        auto carry_extract_and_sanitize_bool_lut_f =
+            [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+          return sanitize_bool_f(carry_extract_lut_f(x));
+        };
+
+        auto h_lut_indexes = static_cast<Torus *>(
+            malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+
+        auto index_gen = [num_compact_lists,
+                          num_lwes_per_compact_list =
+                              this->num_lwes_per_compact_list,
+                          num_packed_msgs, is_boolean_array,
+                          h_indexes_out](Torus *h_lut_indexes, uint32_t) {
+          auto offset = 0;
+          for (int k = 0; k < num_compact_lists; k++) {
+            auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+            for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
+              auto lwe_index = i + num_packed_msgs * offset;
+              auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
+                                        ? num_packed_msgs
+                                        : 0;
+              h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+            }
+            offset += num_lwes_in_kth;
+          }
+        };
+
+        message_and_carry_extract_luts->generate_and_broadcast_lut(
+            active_streams, {0, 1, 2, 3},
+            {message_extract_lut_f, carry_extract_lut_f,
+             message_extract_and_sanitize_bool_lut_f,
+             carry_extract_and_sanitize_bool_lut_f},
+            index_gen, true, {}, h_lut_indexes);
+        message_and_carry_extract_luts
+            ->allocate_lwe_vector_for_non_trivial_indexes(
+                active_streams, 2 * num_lwes, size_tracker,
+                allocate_gpu_memory);
+        free(h_lut_indexes);
+
+        // SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
+        // on the full casting path.
+        tmp_ksed_small_to_big_expanded_lwes =
+            (Torus *)cuda_malloc_with_size_tracking_async(
+                safe_mul_sizeof<Torus>(num_lwes,
+                                       casting_params.big_lwe_dimension + 1),
+                streams.stream(0), streams.gpu_index(0), size_tracker,
+                allocate_gpu_memory);
+      }
+
+      // The expanded LWEs will always be on the casting key format
+      tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
+          safe_mul_sizeof<Torus>(num_lwes,
+                                 casting_params.big_lwe_dimension + 1),
+          streams.stream(0), streams.gpu_index(0), size_tracker,
+          allocate_gpu_memory);
+
+      free(h_indexes_in);
+      free(h_indexes_out);
+    }

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-    free(h_indexes_in);
-    free(h_indexes_out);
-    free(h_lut_indexes);
  }

  void release(CudaStreams streams) {
-    message_and_carry_extract_luts->release(streams);
-    delete message_and_carry_extract_luts;
-
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut->release(streams);
-      delete identity_lut;
+    if (expand_kind != EXPAND_KIND::NO_CASTING) {
+      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        identity_lut->release(streams);
+        delete identity_lut;
+      } else {
+        message_and_carry_extract_luts->release(streams);
+        delete message_and_carry_extract_luts;
+        cuda_drop_with_size_tracking_async(
+            tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
+            streams.gpu_index(0), gpu_memory_allocated);
+      }
+      cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
+                                         streams.gpu_index(0),
+                                         gpu_memory_allocated);
    }

-    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
-                                       streams.gpu_index(0),
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
-                                       streams.stream(0), streams.gpu_index(0),
-                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -150,3 +150,31 @@ void cuda_glwe_sample_extract_128_async(
          "N's are powers of two in the interval [256..4096].")
  }
 }
+
+void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
+                                            void *lwe_array_out,
+                                            void *lwe_array_in, uint32_t size,
+                                            uint32_t log_modulus,
+                                            uint32_t degree,
+                                            uint32_t grouping_factor) {
+  // Type-erased 64-bit wrapper: casts and forwards to the templated host call.
+  host_modulus_switch_multi_bit<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
+
+void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
+                                             void *lwe_array_out,
+                                             void *lwe_array_in, uint32_t size,
+                                             uint32_t log_modulus,
+                                             uint32_t degree,
+                                             uint32_t grouping_factor) {
+  // Type-erased 128-bit wrapper: casts and forwards to the templated host call.
+  host_modulus_switch_multi_bit<__uint128_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<__uint128_t *>(lwe_array_out),
+      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -463,5 +463,48 @@ __global__ void __launch_bounds__(512)
      return;
  }
 }
+// Noise-test-only kernel: mirrors the monomial-degree logic embedded in the
+// keybundle kernel so it can be tested on its own. Each thread handles the
+// input group at tid * grouping_factor; log_modulus is not used in the body.
+template <typename Torus, class params>
+__global__ void
+modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
+                         uint32_t log_modulus, uint32_t grouping_factor) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    int num_monomials = 1 << grouping_factor;
+    int input_offset = tid * grouping_factor;
+    int output_offset = tid * num_monomials;
+    // All 2^grouping_factor monomials are written, though the first is unused.
+    for (int ggsw_idx = 0; ggsw_idx < num_monomials; ggsw_idx++) {
+      array_out[ggsw_idx + output_offset] =
+          calculates_monomial_degree<Torus, params>(&array_in[input_offset],
+                                                    ggsw_idx, grouping_factor);
+    }
+  }
+}
+// Noise-test-only launcher for modulus_switch_multi_bit: one thread per group
+// of grouping_factor inputs. Only degree 2048 is instantiated; others panic.
+template <typename Torus>
+__host__ void host_modulus_switch_multi_bit(
+    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
+    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  int multibit_size = size / grouping_factor;
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
+  switch (degree) {
+  case 2048:
+    modulus_switch_multi_bit<Torus, Degree<2048>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Only N = 2048 is "
+          "supported by the multi-bit modulus switch.")
+  }
+  // Report any error raised by the kernel launch above.
+  check_cuda_error(cudaGetLastError());
+}

 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shuffle.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shuffle.cu
@@ -0,0 +1,48 @@
+#include "integer/shuffle.cuh"
+
+uint64_t scratch_cuda_integer_bitonic_sort_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t num_values, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool is_signed,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  // FFI entry point: bundles the scalar parameters and builds the sort buffer.
+  PUSH_RANGE("scratch bitonic sort")
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus, noise_reduction_type);
+  // The returned value is the size reported by the templated scratch call.
+  uint64_t ret = scratch_cuda_integer_bitonic_sort<uint64_t>(
+      CudaStreams(streams), (int_bitonic_sort_buffer<uint64_t> **)mem_ptr,
+      num_radix_blocks, num_values, params, is_signed, allocate_gpu_memory);
+  POP_RANGE()
+  return ret;
+}
+
+void cuda_integer_bitonic_sort_64_async(CudaStreamsFFI streams,
+                                        CudaRadixCiphertextFFI **values,
+                                        uint32_t num_values, int8_t *mem_ptr,
+                                        void *const *bsks, void *const *ksks,
+                                        int32_t direction) {
+  // FFI entry point: casts the opaque buffer/keys and runs the 64-bit sort.
+  PUSH_RANGE("bitonic sort")
+  host_bitonic_sort<uint64_t>(CudaStreams(streams), values, num_values,
+                              (int_bitonic_sort_buffer<uint64_t> *)mem_ptr,
+                              bsks, (uint64_t **)(ksks), direction);
+  POP_RANGE()
+}
+
+void cleanup_cuda_integer_bitonic_sort_64(CudaStreamsFFI streams,
+                                          int8_t **mem_ptr_void) {
+  // Releases the sort buffer, deletes it, then nulls the caller's pointer.
+  PUSH_RANGE("cleanup bitonic sort")
+  int_bitonic_sort_buffer<uint64_t> *mem_ptr =
+      (int_bitonic_sort_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release(CudaStreams(streams));
+  delete mem_ptr;
+  *mem_ptr_void = nullptr;
+  POP_RANGE()
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shuffle.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shuffle.cuh
@@ -0,0 +1,336 @@
+#ifndef TFHE_RS_SHUFFLE_CUH
+#define TFHE_RS_SHUFFLE_CUH
+
+#include "integer/comparison.cuh"
+#include "integer/shuffle_utilities.h"
+#include "linearalgebra/addition.cuh"
+#include "radix_ciphertext.cuh"
+
+template <typename Torus>
+__host__ uint64_t scratch_cuda_integer_bitonic_sort(
+    CudaStreams streams, int_bitonic_sort_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, uint32_t num_values, int_radix_params params,
+    bool is_signed, bool allocate_gpu_memory) {
+  // Builds the buffer into *mem_ptr; returns the size_tracker given to it.
+  uint64_t size_tracker = 0;
+  *mem_ptr = new int_bitonic_sort_buffer<Torus>(
+      streams, params, num_radix_blocks, num_values, is_signed,
+      allocate_gpu_memory, size_tracker);
+  return size_tracker;
+}
+
+// Reduces K groups of blocks_per_group sign blocks {INF=0, EQ=1, SUP=2} to K
+// signs by pairwise merge (msb == EQ ? lsb : msb), then applies sign_handler_f.
+template <typename Torus, typename KSTorus>
+__host__ void batched_tree_sign_reduction(
+    CudaStreams streams, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI *input, uint32_t K, uint32_t blocks_per_group,
+    int_bitonic_sort_buffer<Torus> *mem_ptr, void *const *bsks,
+    KSTorus *const *ksks, std::function<Torus(Torus)> sign_handler_f) {
+
+  auto params = mem_ptr->params;
+  auto message_modulus = params.message_modulus;
+  auto x = mem_ptr->batch_cmp_tree_x;
+  auto y = mem_ptr->batch_cmp_tree_y;
+  uint32_t total_blocks = K * blocks_per_group;
+  // Stage the K * blocks_per_group input blocks into batch_cmp_tree_x.
+  copy_radix_ciphertext_slice_async<Torus>(
+      streams.stream(0), streams.gpu_index(0), x, 0, total_blocks, input, 0,
+      total_blocks);
+  // NOTE(review): halving presumably assumes a power-of-two blocks_per_group.
+  // Inner levels: K*M -> K*2.
+  while (blocks_per_group > 2) {
+    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
+                       total_blocks, message_modulus);
+    total_blocks >>= 1;
+    blocks_per_group >>= 1;
+    integer_radix_apply_univariate_lookup_table<Torus>(
+        streams, x, y, bsks, ksks, mem_ptr->batch_inner_tree_leaf_lut,
+        total_blocks);
+  }
+
+  // Last level: merge the final pair (if any) and apply sign_handler_f.
+  auto last_lut = mem_ptr->batch_last_tree_leaf_lut;
+  auto num_bits = log2_int(message_modulus);
+  std::function<Torus(Torus)> f;
+
+  if (blocks_per_group == 2) {
+    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
+                       total_blocks, message_modulus);
+    total_blocks >>= 1;
+    f = [sign_handler_f, num_bits, message_modulus](Torus x) -> Torus {
+      Torus msb = (x >> num_bits) & (message_modulus - 1);
+      Torus lsb = x & (message_modulus - 1);
+      return sign_handler_f((msb == IS_EQUAL) ? lsb : msb);
+    };
+  } else {
+    y = x;
+    f = sign_handler_f;
+  }
+  // The last-level LUT is regenerated from f on every call.
+  auto active = streams.active_gpu_subset(total_blocks, params.pbs_type);
+  last_lut->generate_and_broadcast_lut(active, {0}, {f}, LUT_0_FOR_ALL_BLOCKS,
+                                       true, {mem_ptr->preallocated_h_lut});
+  integer_radix_apply_univariate_lookup_table<Torus>(streams, output, y, bsks,
+                                                     ksks, last_lut, K);
+}
+
+// Batched unsigned comparison of the K pairs (i, i ^ j_param) of one sub-step:
+// pack -> identity PBS -> subtract -> is_non_zero+1 -> tree reduce. Leaves K
+// sign blocks in mem_ptr->comparison_results; k_param is not read here.
+template <typename Torus, typename KSTorus>
+__host__ void host_batched_unsigned_comparison(
+    CudaStreams streams, CudaRadixCiphertextFFI **values, uint32_t num_values,
+    uint32_t k_param, uint32_t j_param, int_bitonic_sort_buffer<Torus> *mem_ptr,
+    void *const *bsks, KSTorus *const *ksks) {
+
+  auto N = mem_ptr->num_radix_blocks;
+  auto params = mem_ptr->params;
+  auto message_modulus = params.message_modulus;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  uint32_t packed_per_pair = N / 2;
+  uint32_t half = mem_ptr->max_num_pairs * packed_per_pair;
+  // NOTE(review): N / 2 truncates for odd N - presumably N is even; confirm.
+  // Gather + pack each pair (values[i], values[l]) into left/right halves.
+  uint32_t K = 0;
+  for (uint32_t i = 0; i < num_values; i++) {
+    uint32_t l = i ^ j_param;
+    if (l <= i)
+      continue;
+    // NOTE(review): l is not checked against num_values here - confirm.
+    CudaRadixCiphertextFFI lp;
+    as_radix_ciphertext_slice<Torus>(&lp, mem_ptr->batch_cmp_packed,
+                                     K * packed_per_pair,
+                                     (K + 1) * packed_per_pair);
+    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &lp, values[i],
+                       N, message_modulus);
+
+    CudaRadixCiphertextFFI rp;
+    as_radix_ciphertext_slice<Torus>(&rp, mem_ptr->batch_cmp_packed,
+                                     half + K * packed_per_pair,
+                                     half + (K + 1) * packed_per_pair);
+    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), &rp, values[l],
+                       N, message_modulus);
+    K++;
+  }
+
+  uint32_t total_packed = K * packed_per_pair;
+
+  // Identity PBS to clean noise after packing.
+  CudaRadixCiphertextFFI packed_view;
+  as_radix_ciphertext_slice<Torus>(&packed_view, mem_ptr->batch_cmp_packed, 0,
+                                   2 * total_packed);
+  integer_radix_apply_univariate_lookup_table<Torus>(
+      streams, &packed_view, &packed_view, bsks, ksks,
+      mem_ptr->batch_identity_lut, 2 * total_packed);
+
+  // Raw LWE subtract: cmp = left - right.
+  CudaRadixCiphertextFFI left_half, right_half, cmp_view;
+  as_radix_ciphertext_slice<Torus>(&left_half, mem_ptr->batch_cmp_packed, 0,
+                                   total_packed);
+  as_radix_ciphertext_slice<Torus>(&right_half, mem_ptr->batch_cmp_packed, half,
+                                   half + total_packed);
+  as_radix_ciphertext_slice<Torus>(&cmp_view, mem_ptr->batch_cmp_comparisons, 0,
+                                   total_packed);
+
+  host_subtraction<Torus>(
+      streams.stream(0), streams.gpu_index(0),
+      static_cast<Torus *>(cmp_view.ptr), static_cast<Torus *>(left_half.ptr),
+      static_cast<Torus *>(right_half.ptr), big_lwe_dimension, total_packed);
+
+  // Map diff to {0=INF, 1=EQ, 2=SUP} via is_non_zero + scalar one.
+  integer_radix_apply_univariate_lookup_table<Torus>(
+      streams, &cmp_view, &cmp_view, bsks, ksks, mem_ptr->batch_is_non_zero_lut,
+      total_packed);
+  host_add_scalar_one_inplace<Torus>(streams, &cmp_view, message_modulus,
+                                     params.carry_modulus);
+  // No post-processing of the final sign: reduce with the identity handler.
+  std::function<Torus(Torus)> identity_f = [](Torus x) -> Torus { return x; };
+  batched_tree_sign_reduction<Torus>(streams, mem_ptr->comparison_results,
+                                     &cmp_view, K, packed_per_pair, mem_ptr,
+                                     bsks, ksks, identity_f);
+}
+
+// Phase 1 of a sub-step: one comparison sign per pair into comparison_results.
+// Batched when unsigned; one host_difference_check per pair when signed.
+template <typename Torus, typename KSTorus>
+__host__ void host_bitonic_sort_compare_phase(
+    CudaStreams streams, CudaRadixCiphertextFFI **values, uint32_t num_values,
+    uint32_t k_param, uint32_t j_param, int_bitonic_sort_buffer<Torus> *mem_ptr,
+    void *const *bsks, KSTorus *const *ksks) {
+
+  auto N = mem_ptr->num_radix_blocks;
+  // k_param is only forwarded to the batched path.
+  if (!mem_ptr->is_signed) {
+    host_batched_unsigned_comparison<Torus>(
+        streams, values, num_values, k_param, j_param, mem_ptr, bsks, ksks);
+  } else {
+    auto cmp_mem = mem_ptr->comparison_mem;
+    uint32_t pair_idx = 0;
+    for (uint32_t i = 0; i < num_values; i++) {
+      uint32_t l = i ^ j_param;
+      if (l <= i)
+        continue;
+      host_difference_check<Torus>(streams, cmp_mem->tmp_lwe_array_out,
+                                   values[i], values[l], cmp_mem,
+                                   cmp_mem->identity_lut_f, bsks, ksks, N);
+      copy_radix_ciphertext_slice_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), mem_ptr->comparison_results,
+          pair_idx, pair_idx + 1, cmp_mem->tmp_lwe_array_out, 0, 1);
+      pair_idx++;
+    }
+  }
+}
+
+// Phase 2a: for each compare-and-swap pair, copy (min, max) candidates into
+// batch_buffer_in (2N blocks per pair on each side) and broadcast the pair's
+// sign into batch_condition. Outputs K_out (pair count) and half_out (K * 2N).
+template <typename Torus>
+__host__ void host_bitonic_sort_gather_cmux_batch(
+    CudaStreams streams, CudaRadixCiphertextFFI **values, uint32_t num_values,
+    uint32_t k_param, uint32_t j_param, int32_t direction,
+    int_bitonic_sort_buffer<Torus> *mem_ptr, uint32_t &K_out,
+    uint32_t &half_out) {
+
+  auto N = mem_ptr->num_radix_blocks;
+  uint32_t blocks_per_pair = 2 * N;
+  // First pass: count the pairs so the second half's offset is known.
+  uint32_t K = 0;
+  for (uint32_t i = 0; i < num_values; i++) {
+    if ((i ^ j_param) > i)
+      K++;
+  }
+  uint32_t half = K * blocks_per_pair;
+  // Second pass: fill both halves of batch_buffer_in and batch_condition.
+  uint32_t pair_idx = 0;
+  for (uint32_t i = 0; i < num_values; i++) {
+    uint32_t l = i ^ j_param;
+    if (l <= i)
+      continue;
+
+    // Ascending iff (i & k_param) == 0; direction == 0 (descending) flips it.
+    bool ascending = ((i & k_param) == 0);
+    if (direction == 0)
+      ascending = !ascending;
+    // *_t feed the first half of batch_buffer_in, *_f the second half.
+    uint32_t base = pair_idx * blocks_per_pair;
+    CudaRadixCiphertextFFI *min_t = ascending ? values[l] : values[i];
+    CudaRadixCiphertextFFI *max_t = ascending ? values[i] : values[l];
+    CudaRadixCiphertextFFI *min_f = ascending ? values[i] : values[l];
+    CudaRadixCiphertextFFI *max_f = ascending ? values[l] : values[i];
+
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), mem_ptr->batch_buffer_in, base,
+        base + N, min_t, 0, N);
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), mem_ptr->batch_buffer_in,
+        base + N, base + blocks_per_pair, max_t, 0, N);
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), mem_ptr->batch_buffer_in,
+        half + base, half + base + N, min_f, 0, N);
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), mem_ptr->batch_buffer_in,
+        half + base + N, half + base + blocks_per_pair, max_f, 0, N);
+    for (uint32_t b = 0; b < blocks_per_pair; b++) {
+      copy_radix_ciphertext_slice_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), mem_ptr->batch_condition,
+          base + b, base + b + 1, mem_ptr->comparison_results, pair_idx,
+          pair_idx + 1);
+      copy_radix_ciphertext_slice_async<Torus>(
+          streams.stream(0), streams.gpu_index(0), mem_ptr->batch_condition,
+          half + base + b, half + base + b + 1, mem_ptr->comparison_results,
+          pair_idx, pair_idx + 1);
+    }
+    pair_idx++;
+  }
+
+  K_out = K;
+  half_out = half;
+}
+
+// Phase 2b: bivariate cmux PBS over 2 * half blocks, add the two halves,
+// message-extract, then scatter each pair's 2N blocks back into values[].
+template <typename Torus, typename KSTorus>
+__host__ void host_bitonic_sort_apply_cmux_batch(
+    CudaStreams streams, CudaRadixCiphertextFFI **values, uint32_t num_values,
+    uint32_t j_param, uint32_t K, uint32_t half,
+    int_bitonic_sort_buffer<Torus> *mem_ptr, void *const *bsks,
+    KSTorus *const *ksks) {
+
+  auto N = mem_ptr->num_radix_blocks;
+  auto params = mem_ptr->params;
+  uint32_t blocks_per_pair = 2 * N;
+  uint32_t total_bivariate = 2 * half;
+
+  integer_radix_apply_bivariate_lookup_table<Torus>(
+      streams, mem_ptr->batch_buffer_out, mem_ptr->batch_buffer_in,
+      mem_ptr->batch_condition, bsks, ksks, mem_ptr->batch_predicate_lut,
+      total_bivariate, params.message_modulus);
+
+  // Sum the two halves: one side holds the selected value, the other is zero.
+  CudaRadixCiphertextFFI true_half, false_half;
+  as_radix_ciphertext_slice<Torus>(&true_half, mem_ptr->batch_buffer_out, 0,
+                                   half);
+  as_radix_ciphertext_slice<Torus>(&false_half, mem_ptr->batch_buffer_out, half,
+                                   total_bivariate);
+  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &true_half,
+                       &true_half, &false_half, half, params.message_modulus,
+                       params.carry_modulus);
+  // extract_out views the same first-half blocks as true_half (in place).
+  CudaRadixCiphertextFFI extract_out;
+  as_radix_ciphertext_slice<Torus>(&extract_out, mem_ptr->batch_buffer_out, 0,
+                                   half);
+  integer_radix_apply_univariate_lookup_table<Torus>(
+      streams, &extract_out, &true_half, bsks, ksks,
+      mem_ptr->batch_message_extract_lut, half);
+  // Scatter: pair p occupies blocks [p * 2N, (p + 1) * 2N) of extract_out.
+  uint32_t pair_idx = 0;
+  for (uint32_t i = 0; i < num_values; i++) {
+    uint32_t l = i ^ j_param;
+    if (l <= i)
+      continue;
+    uint32_t base = pair_idx * blocks_per_pair;
+    copy_radix_ciphertext_slice_async<Torus>(streams.stream(0),
+                                             streams.gpu_index(0), values[i], 0,
+                                             N, &extract_out, base, base + N);
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), values[l], 0, N, &extract_out,
+        base + N, base + blocks_per_pair);
+    pair_idx++;
+  }
+}
+
+// One sub-step of the bitonic network for fixed (k_param, j_param).
+template <typename Torus, typename KSTorus>
+__host__ void
+host_bitonic_sort_substep(CudaStreams streams, CudaRadixCiphertextFFI **values,
+                          uint32_t num_values, uint32_t k_param,
+                          uint32_t j_param, int32_t direction,
+                          int_bitonic_sort_buffer<Torus> *mem_ptr,
+                          void *const *bsks, KSTorus *const *ksks) {
+  // Phase 1: comparison signs for every pair of this sub-step.
+  host_bitonic_sort_compare_phase<Torus>(streams, values, num_values, k_param,
+                                         j_param, mem_ptr, bsks, ksks);
+  // Phase 2a fills K and half, which phase 2b then consumes.
+  uint32_t K, half;
+  host_bitonic_sort_gather_cmux_batch<Torus>(streams, values, num_values,
+                                             k_param, j_param, direction,
+                                             mem_ptr, K, half);
+
+  host_bitonic_sort_apply_cmux_batch<Torus>(
+      streams, values, num_values, j_param, K, half, mem_ptr, bsks, ksks);
+}
+
+template <typename Torus, typename KSTorus>
+__host__ void
+host_bitonic_sort(CudaStreams streams, CudaRadixCiphertextFFI **values,
+                  uint32_t num_values, int_bitonic_sort_buffer<Torus> *mem_ptr,
+                  void *const *bsks, KSTorus *const *ksks, int32_t direction) {
+  // NOTE(review): i ^ j indexing presumably needs power-of-two num_values.
+  for (uint32_t k = 2; k <= num_values; k <<= 1)
+    for (uint32_t j = k >> 1; j > 0; j >>= 1)
+      host_bitonic_sort_substep<Torus>(streams, values, num_values, k, j,
+                                       direction, mem_ptr, bsks, ksks);
+}
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -420,6 +420,39 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
  }
 }

+// Noise-tests variant of host_cg_multi_bit_programmable_bootstrap: the
+// keybundle is computed by execute_compute_keybundle_noise_tests.
+template <typename Torus, class params>
+__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus const *lwe_output_indexes, Torus const *lut_vector,
+    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  auto lwe_chunk_size = buffer->lwe_chunk_size;
+  // Walk the lwe_dimension / grouping_factor groups, lwe_chunk_size at a time.
+  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
+       lwe_offset += lwe_chunk_size) {
+
+    // Compute a keybundle with NOISE_TESTS mode instead of GENERIC
+    execute_compute_keybundle_noise_tests<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    execute_cg_external_product_loop<Torus, params>(
+        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
+        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
+        lut_stride);
+  }
+}
+
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -645,6 +645,103 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
  *buffer = nullptr;
 }

+// Noise-tests-named scratch/cleanup wrappers: both forward their arguments
+// unchanged to the standard 64-bit multi-bit PBS scratch/cleanup functions.
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  return scratch_cuda_multi_bit_programmable_bootstrap_64_async(
+      stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+      level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
+}
+
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
+  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
+                                                   pbs_buffer);
+}
+
+// Noise-tests variant of the 64-bit multi-bit PBS. Panics unless
+// num_samples == 1, base_log <= 64 and polynomial_size == 2048. The input is
+// assumed to be modulus switched before bootstrapping.
+void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride) {
+
+  PANIC_IF_FALSE(num_samples == 1,
+                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
+                 num_samples);
+
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
+                 base_log);
+  PANIC_IF_FALSE(polynomial_size == 2048,
+                 "Cuda error (multi-bit PBS noise tests): only polynomial "
+                 "size 2048 is supported, got %d.",
+                 polynomial_size);
+
+  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
+      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
+  // Dispatch on buffer->pbs_variant; TBC is only built when CUDA_ARCH >= 900.
+  switch (buffer->pbs_variant) {
+  case PBS_VARIANT::TBC:
+#if CUDA_ARCH >= 900
+  {
+    host_tbc_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
+                                                          Degree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(bootstrapping_key), buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
+        base_log, level_count, num_samples, num_many_lut, lut_stride);
+  } break;
+#else
+    PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
+#endif
+  case PBS_VARIANT::CG:
+    host_cg_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
+                                                         Degree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(bootstrapping_key), buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
+        base_log, level_count, num_samples, num_many_lut, lut_stride);
+    break;
+  case PBS_VARIANT::DEFAULT:
+    host_multi_bit_programmable_bootstrap_noise_tests<uint64_t, Degree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(bootstrapping_key), buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
+        base_log, level_count, num_samples, num_many_lut, lut_stride);
+    break;
+  default:
+    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
+  }
+}
+
 /**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -25,7 +25,8 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
         level_count;
 }

-template <typename Torus, class params, sharedMemDegree SMD>
+template <typename Torus, class params, sharedMemDegree SMD,
+          bool runs_noise_test = false>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -55,9 +56,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {

-    const Torus *block_lwe_array_in =
-        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
-
    double2 *keybundle = keybundle_array +
                         // select the input
                         input_idx * keybundle_size_per_input;
@@ -86,10 +84,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    // Precalculate the monomial degrees and store them in shared memory
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
    if (threadIdx.x < (1 << grouping_factor)) {
-      const Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, threadIdx.x, grouping_factor);
+      if constexpr (runs_noise_test == true) {
+        // For noise tests the input array contains the input lwe but also the
+        // modswitched results. This allows to avoid changing the accumulation
+        // kernel for the noise tests since the input body will stay in the same
+        // position. The layout of the input array is the following:
+        // | input lwe     | modswitched inputs       |
+        // | lwe size      | lwe_size*grouping_factor |
+
+        // This offset allows to jump directly to the modswitched inputs,
+        // skipping the input lwe
+        const Torus modswitched_offset = lwe_dimension + 1;
+
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] *
+                              (lwe_dimension / grouping_factor) *
+                              (1 << grouping_factor) +
+                          modswitched_offset];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
+        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
+
+      } else {
+        // In production we calculate the monomial degrees on the fly, since
+        // they are not stored in the input array.
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+        monomial_degrees[threadIdx.x] =
+            calculates_monomial_degree<Torus, params>(
+                lwe_array_group, threadIdx.x, grouping_factor);
+      }
    }
    __syncthreads();

@@ -145,7 +173,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 // Then we can just calculate the offset needed to apply this coefficients, and
 // the operation transforms into a pointwise vector multiplication, avoiding to
 // perform extra instructions other than MADD
-template <typename Torus, class params, sharedMemDegree SMD>
+template <typename Torus, class params, sharedMemDegree SMD,
+          bool runs_noise_test = false>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -219,10 +248,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;

    if (threadIdx.x < (1 << grouping_factor)) {
-      const Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, threadIdx.x, grouping_factor);
+      if constexpr (runs_noise_test == true) {
+        // For noise tests the input array contains the input lwe but also the
+        // modswitched results. This allows to avoid changing the accumulation
+        // kernel for the noise tests since the input body will stay in the same
+        // position. The layout of the input array is the following:
+        // | input lwe     | modswitched inputs       |
+        // | lwe size      | lwe_size*grouping_factor |
+
+        // This offset allows to jump directly to the modswitched inputs,
+        // skipping the input lwe
+        const Torus modswitched_offset = lwe_dimension + 1;
+
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] *
+                              (lwe_dimension / grouping_factor) *
+                              (1 << grouping_factor) +
+                          modswitched_offset];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
+        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
+
+      } else {
+        // In production we calculate the monomial degrees on the fly, since
+        // they are not stored in the input array.
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+        monomial_degrees[threadIdx.x] =
+            calculates_monomial_degree<Torus, params>(
+                lwe_array_group, threadIdx.x, grouping_factor);
+      }
    }
    __syncthreads();

@@ -662,6 +721,7 @@ enum class MultiBitKeybundleLaunchMode {
  AUTO,
  GENERIC,
  SPECIALIZED_2_2,
+  NOISE_TESTS,
 };

 template <typename Torus, class params>
@@ -726,30 +786,65 @@ __host__ void execute_compute_keybundle_with_mode(
    bool use_specialized =
        launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
        (launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
+         can_use_specialized) ||
+        (launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS &&
         can_use_specialized);
+    bool use_noise_test_template =
+        launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS;
    if (use_specialized) {
      dim3 thds_new_keybundle(512, 1, 1);
-      check_cuda_error(cudaFuncSetAttribute(
-          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-              Torus, Degree<2048>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-              Torus, Degree<2048>, FULLSM>,
-          cudaFuncCachePreferShared));
-      check_cuda_error(cudaGetLastError());
-      device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-          Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
-                                         3 * full_sm_keybundle, stream>>>(
-          lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-          lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
+      if (use_noise_test_template) {
+        // Set up the noise-test variant of the specialized 2_2 kernel
+        check_cuda_error(cudaFuncSetAttribute(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM, true>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            3 * full_sm_keybundle));
+        check_cuda_error(cudaFuncSetCacheConfig(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM, true>,
+            cudaFuncCachePreferShared));
+        check_cuda_error(cudaGetLastError());
+        device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+            Torus, Degree<2048>, FULLSM, true>
+            <<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
+               stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
+                         bootstrapping_key, lwe_dimension, lwe_offset,
+                         chunk_size, keybundle_size_per_input);
+      } else {
+        check_cuda_error(cudaFuncSetAttribute(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            3 * full_sm_keybundle));
+        check_cuda_error(cudaFuncSetCacheConfig(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM>,
+            cudaFuncCachePreferShared));
+        check_cuda_error(cudaGetLastError());
+        device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+            Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
+                                           3 * full_sm_keybundle, stream>>>(
+            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+            lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
+      }
    } else {
-      device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
-          <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
-              lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-              lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-              level_count, lwe_offset, chunk_size, keybundle_size_per_input,
-              d_mem, 0);
+      if (use_noise_test_template) {
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM,
+                                                          true>
+            <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+                lwe_array_in, lwe_input_indexes, keybundle_fft,
+                bootstrapping_key, lwe_dimension, glwe_dimension,
+                polynomial_size, grouping_factor, level_count, lwe_offset,
+                chunk_size, keybundle_size_per_input, d_mem, 0);
+      } else {
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
+            <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+                lwe_array_in, lwe_input_indexes, keybundle_fft,
+                bootstrapping_key, lwe_dimension, glwe_dimension,
+                polynomial_size, grouping_factor, level_count, lwe_offset,
+                chunk_size, keybundle_size_per_input, d_mem, 0);
+      }
    }
  }
  check_cuda_error(cudaGetLastError());
@@ -796,6 +891,20 @@ __host__ void execute_compute_keybundle_2_2_specialized(
      grouping_factor, level_count, lwe_offset,
      MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
 }
+// Keybundle entry point reserved for the noise tests: every argument is
+// forwarded untouched to execute_compute_keybundle_with_mode, with the launch
+// mode pinned to MultiBitKeybundleLaunchMode::NOISE_TESTS.
+template <typename Torus, class params>
+__host__ void execute_compute_keybundle_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
+  // The launch mode is the only decision taken by this wrapper
+  constexpr MultiBitKeybundleLaunchMode keybundle_mode =
+      MultiBitKeybundleLaunchMode::NOISE_TESTS;
+
+  execute_compute_keybundle_with_mode<Torus, params>(
+      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+      grouping_factor, level_count, lwe_offset, keybundle_mode);
+}

 template <typename Torus, class params, bool is_first_iter>
 __host__ void execute_step_one(
@@ -955,4 +1064,62 @@ __host__ void host_multi_bit_programmable_bootstrap(
    }
  }
 }
+
+// Noise-test variant of the default multi-bit PBS host loop. The
+// lwe_dimension / grouping_factor iterations are processed in chunks of
+// buffer->lwe_chunk_size: each chunk first gets its keybundle computed in
+// NOISE_TESTS mode, then runs one step-one / step-two pair per iteration.
+template <typename Torus, class params>
+__host__ void host_multi_bit_programmable_bootstrap_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus const *lwe_output_indexes, Torus const *lut_vector,
+    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  auto iterations_per_chunk = buffer->lwe_chunk_size;
+  // Total number of iterations across all chunks
+  const uint32_t total_iterations = lwe_dimension / grouping_factor;
+
+  for (uint32_t lwe_offset = 0; lwe_offset < total_iterations;
+       lwe_offset += iterations_per_chunk) {
+
+    // Keybundle for this chunk, computed through the NOISE_TESTS launch mode
+    execute_compute_keybundle_noise_tests<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    // The final chunk may contain fewer iterations than a full chunk
+    const uint32_t iterations_in_chunk = std::min(
+        (uint32_t)iterations_per_chunk, total_iterations - lwe_offset);
+
+    // Accumulate (same as standard path)
+    for (uint32_t j = 0; j < iterations_in_chunk; j++) {
+      // Position of this iteration over the whole loop, not within the chunk
+      const uint32_t global_iteration = j + lwe_offset;
+
+      // Step one is specialized at compile time on "first iteration"
+      if (global_iteration == 0) {
+        execute_step_one<Torus, params, true>(
+            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+            lwe_input_indexes, buffer, num_samples, lwe_dimension,
+            glwe_dimension, polynomial_size, base_log, level_count);
+      } else {
+        execute_step_one<Torus, params, false>(
+            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+            lwe_input_indexes, buffer, num_samples, lwe_dimension,
+            glwe_dimension, polynomial_size, base_log, level_count);
+      }
+
+      // Step two is specialized at compile time on "last iteration"; it
+      // receives the chunk-local index j
+      if (global_iteration + 1 == total_iterations) {
+        execute_step_two<Torus, params, true>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      } else {
+        execute_step_two<Torus, params, false>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      }
+    }
+  }
+}
 #endif // MULTIBIT_PBS_H
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -293,6 +293,81 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
  *buffer = nullptr;
 }

+// Noise-tests-namespaced wrappers: delegate to the standard scratch/cleanup so
+// that callers using the noise-tests PBS128 variant have a consistent API.
+//
+// Scratch wrapper: passes all arguments through to
+// scratch_cuda_multi_bit_programmable_bootstrap_128_async and returns whatever
+// that call returns.
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  const uint64_t scratch_result =
+      scratch_cuda_multi_bit_programmable_bootstrap_128_async(
+          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
+  return scratch_result;
+}
+
+// Cleanup wrapper for the noise-tests PBS128 variant: runs the standard
+// cleanup_cuda_multi_bit_programmable_bootstrap_128 on the buffer, then calls
+// cuda_synchronize_stream on the same stream and GPU.
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
+  // Typed view of the opaque stream handle, used for the synchronization
+  cudaStream_t typed_stream = static_cast<cudaStream_t>(stream);
+
+  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
+                                                    pbs_buffer);
+  cuda_synchronize_stream(typed_stream, gpu_index);
+}
+
+// Noise tests variant of the 128-bit multi-bit PBS, restricted to
+// polynomial_size=2048. The input is assumed to contain precomputed
+// modswitched values in the extended input array layout.
+//
+// Panics unless num_samples == 1, base_log <= 64 and polynomial_size == 2048.
+// The host implementation is chosen from the pbs_variant stored in the buffer
+// behind mem_ptr; only the CG and DEFAULT variants are handled.
+void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lwe_array_in, void const *lwe_input_indexes,
+    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  PANIC_IF_FALSE(num_samples == 1,
+                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
+                 num_samples);
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
+                 base_log);
+  PANIC_IF_FALSE(polynomial_size == 2048,
+                 "Cuda error (multi-bit PBS128 noise tests): only polynomial "
+                 "size 2048 is supported, got %d.",
+                 polynomial_size);
+
+  // Typed views of the opaque pointers, shared by both variants below
+  auto typed_stream = static_cast<cudaStream_t>(stream);
+  auto *typed_out = static_cast<__uint128_t *>(lwe_array_out);
+  auto *typed_out_indexes = static_cast<const uint64_t *>(lwe_output_indexes);
+  auto *typed_lut = static_cast<const __uint128_t *>(lut_vector);
+  auto *typed_in = static_cast<const uint64_t *>(lwe_array_in);
+  auto *typed_in_indexes = static_cast<const uint64_t *>(lwe_input_indexes);
+  auto *typed_bsk = static_cast<const __uint128_t *>(bootstrapping_key);
+  auto *buffer =
+      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
+
+  switch (buffer->pbs_variant) {
+  case PBS_VARIANT::CG:
+    host_cg_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
+                                                             Degree<2048>>(
+        typed_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_in, typed_in_indexes, typed_bsk, buffer, glwe_dimension,
+        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        num_samples, num_many_lut, lut_stride);
+    break;
+  case PBS_VARIANT::DEFAULT:
+    host_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
+                                                          Degree<2048>>(
+        typed_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_in, typed_in_indexes, typed_bsk, buffer, glwe_dimension,
+        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        num_samples, num_many_lut, lut_stride);
+    break;
+  default:
+    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
+  }
+}
+
 /**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -18,7 +18,8 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
                                      (size_t)2); // accumulator
 }

-template <typename InputTorus, class params, sharedMemDegree SMD>
+template <typename InputTorus, class params, sharedMemDegree SMD,
+          bool runs_noise_test = false>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
    const InputTorus *__restrict__ lwe_array_in,
    const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
@@ -80,11 +81,35 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
    // Precalculate the monomial degrees and store them in shared memory
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
    if (threadIdx.x < (1 << grouping_factor)) {
-      auto lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      monomial_degrees[threadIdx.x] =
-          calculates_monomial_degree<InputTorus, params>(
-              lwe_array_group, threadIdx.x, grouping_factor);
+      if constexpr (runs_noise_test == true) {
+        // For noise tests the input array contains the input lwe but also the
+        // modswitched results. This allows to avoid changing the accumulation
+        // kernel for the noise tests since the input body will stay in the same
+        // position. The layout of the input array is the following:
+        // | input lwe     | modswitched inputs       |
+        // | lwe size      | lwe_size*grouping_factor |
+
+        // This offset allows to jump directly to the modswitched inputs,
+        // skipping the input lwe
+        const InputTorus modswitched_offset = lwe_dimension + 1;
+
+        const InputTorus *block_lwe_array_in_noise =
+            &lwe_array_in[lwe_input_indexes[input_idx] *
+                              (lwe_dimension / grouping_factor) *
+                              (1 << grouping_factor) +
+                          modswitched_offset];
+
+        const InputTorus *lwe_array_group =
+            block_lwe_array_in_noise +
+            rev_lwe_iteration * (1 << grouping_factor);
+        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
+      } else {
+        auto lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+        monomial_degrees[threadIdx.x] =
+            calculates_monomial_degree<InputTorus, params>(
+                lwe_array_group, threadIdx.x, grouping_factor);
+      }
    }
    __syncthreads();

@@ -588,6 +613,74 @@ __host__ void execute_compute_keybundle_128(
  check_cuda_error(cudaGetLastError());
 }

+// Noise-test launcher for the 128-bit keybundle kernel. It always instantiates
+// device_multi_bit_programmable_bootstrap_keybundle_128 with
+// runs_noise_test=true, so the kernel reads the modswitched inputs from the
+// extended input array layout instead of computing them on-the-fly.
+//
+// The FULLSM instantiation is launched when the device's maximum shared
+// memory can hold the keybundle working set; otherwise the NOSM instantiation
+// is launched with no dynamic shared memory and d_mem as working area.
+template <typename InputTorus, class params>
+__host__ void execute_compute_keybundle_noise_tests_128(
+    cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
+    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
+    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
+  cuda_set_device(gpu_index);
+
+  // Number of iterations covered by this launch: a full chunk, or whatever
+  // remains after lwe_offset for the last chunk
+  auto lwe_chunk_size = buffer->lwe_chunk_size;
+  uint64_t chunk_size = std::min(
+      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
+
+  uint64_t keybundle_size_per_input =
+      lwe_chunk_size * level_count * (glwe_dimension + 1) *
+      (glwe_dimension + 1) * (polynomial_size / 2) * 4;
+
+  // Shared memory needed by the FULLSM kernel versus what the device offers
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
+          __uint128_t>(polynomial_size);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  const bool fits_in_shared_memory = !(max_shared_memory < full_sm_keybundle);
+
+  auto d_mem = buffer->d_mem_keybundle;
+  auto keybundle_fft = buffer->keybundle_fft;
+
+  dim3 thds(polynomial_size / params::opt, 1, 1);
+  dim3 grid_keybundle(num_samples * chunk_size,
+                      (glwe_dimension + 1) * (glwe_dimension + 1), level_count);
+
+  if (fits_in_shared_memory) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, FULLSM, true>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, FULLSM, true>,
+        cudaFuncCachePreferShared));
+    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
+                                                          FULLSM, true>
+        <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+            d_mem, 0);
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, NOSM, true>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, NOSM, true>,
+        cudaFuncCachePreferShared));
+    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
+                                                          NOSM, true>
+        <<<grid_keybundle, thds, 0, stream>>>(
+            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+            d_mem, full_sm_keybundle);
+  }
+  // Surface any launch error from whichever kernel was started above
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename InputTorus, class params, bool is_first_iter>
 __host__ void execute_step_one_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
@@ -1200,4 +1293,96 @@ supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
  }
 }

+// Noise tests variant of the cooperative-groups 128-bit multi-bit PBS host
+// loop: same structure as host_cg_multi_bit_programmable_bootstrap_128, except
+// that each chunk's keybundle comes from the noise-test launcher
+// (runs_noise_test=true) rather than the standard one.
+template <typename InputTorus, class params>
+__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests_128(
+    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
+    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
+    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  auto iterations_per_chunk = buffer->lwe_chunk_size;
+  // Total number of iterations across all chunks
+  const uint32_t total_iterations = lwe_dimension / grouping_factor;
+
+  for (uint32_t lwe_offset = 0; lwe_offset < total_iterations;
+       lwe_offset += iterations_per_chunk) {
+
+    // Keybundle for the chunk starting at lwe_offset, built from the
+    // precomputed modswitched values
+    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    // External-product loop over the same chunk
+    execute_cg_external_product_loop_128<InputTorus, params>(
+        stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
+        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        lwe_offset, num_many_lut, lut_stride);
+  }
+}
+
+// Noise tests variant of the default 128-bit multi-bit PBS host loop. The
+// lwe_dimension / grouping_factor iterations are processed in chunks of
+// buffer->lwe_chunk_size: each chunk first gets its keybundle from the
+// noise-test launcher, then runs one step-one / step-two pair per iteration.
+template <typename InputTorus, class params>
+__host__ void host_multi_bit_programmable_bootstrap_noise_tests_128(
+    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
+    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
+    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  auto iterations_per_chunk = buffer->lwe_chunk_size;
+  // Total number of iterations across all chunks
+  const uint32_t total_iterations = lwe_dimension / grouping_factor;
+
+  for (uint32_t lwe_offset = 0; lwe_offset < total_iterations;
+       lwe_offset += iterations_per_chunk) {
+
+    // Keybundle for this chunk, read from the precomputed modswitched values
+    // (runs_noise_test=true kernel variant)
+    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    // The final chunk may contain fewer iterations than a full chunk
+    uint64_t iterations_in_chunk = std::min(
+        (uint32_t)iterations_per_chunk, total_iterations - lwe_offset);
+
+    // Accumulate (same as standard path)
+    for (uint32_t j = 0; j < iterations_in_chunk; j++) {
+      // Position of this iteration over the whole loop, not within the chunk
+      const uint32_t global_iteration = j + lwe_offset;
+
+      // Step one is specialized at compile time on "first iteration"
+      if (global_iteration == 0) {
+        execute_step_one_128<InputTorus, params, true>(
+            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+            base_log, level_count);
+      } else {
+        execute_step_one_128<InputTorus, params, false>(
+            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+            base_log, level_count);
+      }
+
+      // Step two is specialized at compile time on "last iteration"; it
+      // receives the chunk-local index j
+      if (global_iteration + 1 == total_iterations) {
+        execute_step_two_128<InputTorus, params, true>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      } else {
+        execute_step_two_128<InputTorus, params, false>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      }
+    }
+  }
+}
+
 #endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -795,6 +795,40 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
      MultiBitTbcLaunchMode::SPECIALIZED_2_2);
 }

+// Noise tests variant of the TBC multi-bit PBS host loop. Only the keybundle
+// step differs from the regular path: it goes through the NOISE_TESTS
+// keybundle mode, while the TBC external-product loop is still launched with
+// MultiBitTbcLaunchMode::AUTO.
+template <typename Torus, class params>
+__host__ void host_tbc_multi_bit_programmable_bootstrap_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus const *lwe_output_indexes, Torus const *lut_vector,
+    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+  cuda_set_device(gpu_index);
+
+  auto iterations_per_chunk = buffer->lwe_chunk_size;
+  // Total number of iterations across all chunks
+  const uint32_t total_iterations = lwe_dimension / grouping_factor;
+
+  for (uint32_t lwe_offset = 0; lwe_offset < total_iterations;
+       lwe_offset += iterations_per_chunk) {
+
+    // Keybundle for the chunk starting at lwe_offset (NOISE_TESTS mode)
+    execute_compute_keybundle_noise_tests<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    // Accumulation over the same chunk, launched exactly as the standard TBC
+    // path does
+    constexpr MultiBitTbcLaunchMode accumulate_mode =
+        MultiBitTbcLaunchMode::AUTO;
+    execute_tbc_external_product_loop<Torus, params>(
+        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
+        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
+        lut_stride, accumulate_mode);
+  }
+}
+
 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory) {
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -119,71 +119,73 @@ __host__ void host_expand_without_verification(
      streams.stream(0), streams.gpu_index(0), true);

  if (mem_ptr->expand_kind == EXPAND_KIND::NO_CASTING) {
+    // This path is added to mimic the CPU fallback behaviour for the no_casting
+    // expand, which is needed for the noise sanity checks.
    host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
                                   lwe_array_out, d_expand_jobs, num_lwes);
-    return;
-  }

-  host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
-                                 expanded_lwes, d_expand_jobs, num_lwes);
+  } else {
+    // This is our default path for the expand with casting if needed.
+    host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
+                                   expanded_lwes, d_expand_jobs, num_lwes);

-  auto lwe_array_input = expanded_lwes;
-  auto ksks = casting_keys;
-  auto message_and_carry_extract_luts = mem_ptr->message_and_carry_extract_luts;
+    auto lwe_array_input = expanded_lwes;
+    auto ksks = casting_keys;
+    auto message_and_carry_extract_luts =
+        mem_ptr->message_and_carry_extract_luts;

-  auto lut = mem_ptr->message_and_carry_extract_luts;
-  if (casting_key_type == SMALL_TO_BIG) {
-    if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
+    auto lut = mem_ptr->message_and_carry_extract_luts;
+    if (casting_key_type == SMALL_TO_BIG) {
+      if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
+      }
+      // Keyswitch from small to big key if needed
+      auto ksed_small_to_big_expanded_lwes =
+          mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
+      std::vector<Torus *> lwe_trivial_indexes_vec =
+          lut->lwe_trivial_indexes_vec;
+
+      auto casting_params = mem_ptr->casting_params;
+      auto casting_output_dimension = casting_params.big_lwe_dimension;
+      auto casting_input_dimension = casting_params.small_lwe_dimension;
+      auto casting_ks_level = casting_params.ks_level;
+      auto casting_ks_base_log = casting_params.ks_base_log;
+
+      // apply keyswitch to BIG
+      execute_keyswitch_async<Torus>(
+          streams.get_ith(0), ksed_small_to_big_expanded_lwes,
+          lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
+          casting_keys, casting_input_dimension, casting_output_dimension,
+          casting_ks_base_log, casting_ks_level, num_lwes,
+          lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
+
+      // In this case, the next keyswitch will use the compute ksk
+      ksks = compute_ksks;
+      lwe_array_input = ksed_small_to_big_expanded_lwes;
    }
-    // Keyswitch from small to big key if needed
-    auto ksed_small_to_big_expanded_lwes =
-        mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
-    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

-    auto casting_params = mem_ptr->casting_params;
-    auto casting_output_dimension = casting_params.big_lwe_dimension;
-    auto casting_input_dimension = casting_params.small_lwe_dimension;
-    auto casting_ks_level = casting_params.ks_level;
-    auto casting_ks_base_log = casting_params.ks_base_log;
+    // Apply LUT
+    cuda_memset_async(lwe_array_out, 0,
+                      safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
+                                             (size_t)num_lwes, (size_t)2),
+                      streams.stream(0), streams.gpu_index(0));
+    CudaRadixCiphertextFFI output;
+    into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
+    CudaRadixCiphertextFFI input;
+    into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
+    // This is a special case only for our noise sanity checks
+    // If we are doing a SANITY_CHECK expand, we just apply the identity LUT
+    // This replicates the CPU fallback behaviour of the casting expand
+    auto final_lut = (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK
+                          ? mem_ptr->identity_lut
+                          : message_and_carry_extract_luts);

-    // apply keyswitch to BIG
-    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), ksed_small_to_big_expanded_lwes,
-        lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
-        casting_keys, casting_input_dimension, casting_output_dimension,
-        casting_ks_base_log, casting_ks_level, num_lwes,
-        lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
-
-    // In this case, the next keyswitch will use the compute ksk
-    ksks = compute_ksks;
-    lwe_array_input = ksed_small_to_big_expanded_lwes;
-  }
-
-  // Apply LUT
-  cuda_memset_async(lwe_array_out, 0,
-                    safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
-                                           (size_t)num_lwes, (size_t)2),
-                    streams.stream(0), streams.gpu_index(0));
-  CudaRadixCiphertextFFI output;
-  into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
-  CudaRadixCiphertextFFI input;
-  into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
-  // This is a special case only for our noise sanity checks
-  // If we are doing a SANITY_CHECK expand, we just apply the identity LUT
-  // This replicates the CPU fallback behaviour of the casting expand
-  if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
    integer_radix_apply_univariate_lookup_table<Torus>(
-        streams, &output, &input, bsks, ksks, mem_ptr->identity_lut,
-        2 * num_lwes);
-    return;
-  }
+        streams, &output, &input, bsks, ksks, final_lut, 2 * num_lwes);

-  integer_radix_apply_univariate_lookup_table<Torus>(
-      streams, &output, &input, bsks, ksks, message_and_carry_extract_luts,
-      2 * num_lwes);
-  release_cpu_radix_ciphertext_async(&input);
-  release_cpu_radix_ciphertext_async(&output);
+    release_cpu_radix_ciphertext_async(&input);
+    release_cpu_radix_ciphertext_async(&output);
+  }
  compact_lwe_lists.release();
 }

--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -79,6 +79,30 @@ unsafe extern "C" {
        polynomial_size: u32,
    );
 }
+unsafe extern "C" {
+    pub fn cuda_modulus_switch_multi_bit_64_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *mut ffi::c_void,
+        size: u32,
+        log_modulus: u32,
+        degree: u32,
+        grouping_factor: u32,
+    );
+}
+unsafe extern "C" {
+    pub fn cuda_modulus_switch_multi_bit_128_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *mut ffi::c_void,
+        size: u32,
+        log_modulus: u32,
+        degree: u32,
+        grouping_factor: u32,
+    );
+}
 pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
 pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
 pub type PBS_TYPE = ffi::c_uint;
@@ -112,9 +136,6 @@ pub type Direction = ffi::c_uint;
 pub const BitValue_Zero: BitValue = 0;
 pub const BitValue_One: BitValue = 1;
 pub type BitValue = ffi::c_uint;
-pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
-pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
-pub type RERAND_MODE = ffi::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct CudaStreamsFFI {
@@ -2311,6 +2332,46 @@ unsafe extern "C" {
 unsafe extern "C" {
    pub fn cleanup_cuda_cast_to_signed_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
 }
+unsafe extern "C" {
+    pub fn scratch_cuda_integer_bitonic_sort_64_async(
+        streams: CudaStreamsFFI,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_radix_blocks: u32,
+        num_values: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        is_signed: bool,
+        allocate_gpu_memory: bool,
+        noise_reduction_type: PBS_MS_REDUCTION_T,
+    ) -> u64;
+}
+unsafe extern "C" {
+    pub fn cuda_integer_bitonic_sort_64_async(
+        streams: CudaStreamsFFI,
+        values: *mut *mut CudaRadixCiphertextFFI,
+        num_values: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        direction: i32,
+    );
+}
+unsafe extern "C" {
+    pub fn cleanup_cuda_integer_bitonic_sort_64(
+        streams: CudaStreamsFFI,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
 unsafe extern "C" {
    pub fn scratch_cuda_integer_compress_radix_ciphertext_64_async(
        streams: CudaStreamsFFI,
@@ -2455,6 +2516,9 @@ unsafe extern "C" {
        glwe_index: u32,
    );
 }
+pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
+pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
+pub type RERAND_MODE = ffi::c_uint;
 unsafe extern "C" {
    pub fn scratch_cuda_rerand_64_async(
        streams: CudaStreamsFFI,
@@ -2467,7 +2531,7 @@ unsafe extern "C" {
        message_modulus: u32,
        carry_modulus: u32,
        allocate_gpu_memory: bool,
-        rerand_type: u32,
+        rerand_type: RERAND_MODE,
    ) -> u64;
 }
 unsafe extern "C" {
@@ -3367,6 +3431,48 @@ unsafe extern "C" {
        pbs_buffer: *mut *mut i8,
    );
 }
+unsafe extern "C" {
+    pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    ) -> u64;
+}
+unsafe extern "C" {
+    pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+}
+unsafe extern "C" {
+    pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        grouping_factor: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+        num_many_lut: u32,
+        lut_stride: u32,
+    );
+}
 unsafe extern "C" {
    pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_async(
        stream: *mut ffi::c_void,
@@ -3408,3 +3514,44 @@ unsafe extern "C" {
        buffer: *mut *mut i8,
    );
 }
+unsafe extern "C" {
+    pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    ) -> u64;
+}
+unsafe extern "C" {
+    pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+}
+unsafe extern "C" {
+    pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        grouping_factor: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+        num_many_lut: u32,
+        lut_stride: u32,
+    );
+}
--- a/backends/tfhe-hpu-backend/LICENSE
+++ b/backends/tfhe-hpu-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-hpu-backend/README.md
+++ b/backends/tfhe-hpu-backend/README.md
@@ -297,8 +297,8 @@ source setup_hpu.sh --config v80 -p
 # Run hlapi benches
 make test_high_level_api_hpu

-# Run hlapi erc20 benches
-make bench_hlapi_erc20_hpu 
+# Run hlapi erc7984 benches
+make bench_hlapi_erc7984_hpu 

 # Run integer level benches
 make bench_integer_hpu
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
@@ -109,7 +109,7 @@
  flush_behaviour = "Patient"
  flush = true

-[firmware.op_cfg.by_op.ERC_20]
+[firmware.op_cfg.by_op.ERC_7984]
  fill_batch_fifo = true
  min_batch_size = false
  use_tiers = true
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
@@ -121,7 +121,7 @@
  flush_behaviour = "Patient"
  flush = true

-[firmware.op_cfg.by_op.ERC_20]
+[firmware.op_cfg.by_op.ERC_7984]
  fill_batch_fifo = true
  min_batch_size = false
  use_tiers = true
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -230,7 +230,7 @@ iop!(
    [IOP_CMP -> "CMP_NEQ", opcode::CMP_NEQ],
    [IOP_CT_F_CT_BOOL -> "IF_THEN_ZERO", opcode::IF_THEN_ZERO],
    [IOP_CT_F_2CT_BOOL -> "IF_THEN_ELSE", opcode::IF_THEN_ELSE],
-    [IOP_2CT_F_3CT -> "ERC_20", opcode::ERC_20],
+    [IOP_2CT_F_3CT -> "ERC_7984", opcode::ERC_7984],
    [IOP_CT_F_CT -> "MEMCPY", opcode::MEMCPY],
    [IOP_CT_F_CT -> "ILOG2", opcode::ILOG2],
    [IOP_CT_F_CT -> "COUNT0", opcode::COUNT0],
@@ -240,5 +240,5 @@ iop!(
    [IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
    [IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
    [IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
-    [IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
+    [IOP_2NCT_F_3NCT -> "ERC_7984_SIMD", opcode::ERC_7984_SIMD],
 );
--- a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
@@ -74,9 +74,9 @@ pub const IF_THEN_ZERO: u8 = 0xCA;
 pub const IF_THEN_ELSE: u8 = 0xCB;

 // Custom algorithm
-// ERC20 -> Found xfer algorithm
+// ERC7984 -> Fund xfer algorithm
 // 2Ct <- func(3Ct)
-pub const ERC_20: u8 = 0x80;
+pub const ERC_7984: u8 = 0x80;

 // Count bits
 pub const COUNT0: u8 = 0x81;
@@ -89,7 +89,7 @@ pub const TRAIL1: u8 = 0x87;

 // SIMD for maximum throughput
 pub const ADD_SIMD: u8 = 0xF0;
-pub const ERC_20_SIMD: u8 = 0xF1;
+pub const ERC_7984_SIMD: u8 = 0xF1;
 //
 // Utility operations
 // Used to handle real clone of ciphertext already uploaded in the Hpu memory
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs
@@ -31,7 +31,7 @@ crate::impl_fw!("Demo" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_20 => fw_impl::ilp::iop_erc_20;
+    ERC_7984 => fw_impl::ilp::iop_erc_7984;

    CMP_GT  => cmp_gt;
    CMP_GTE => cmp_gte;
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
@@ -61,7 +61,7 @@ crate::impl_fw!("Ilp" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_20 => fw_impl::ilp::iop_erc_20;
+    ERC_7984 => fw_impl::ilp::iop_erc_7984;

    MEMCPY => fw_impl::ilp::iop_memcpy;

@@ -74,7 +74,7 @@ crate::impl_fw!("Ilp" [
    TRAIL1 => fw_impl::ilp_log::iop_trail1;
    // SIMD Implementations
    ADD_SIMD     => fw_impl::llt::iop_add_simd;
-    ERC_20_SIMD  => fw_impl::llt::iop_erc_20_simd;
+    ERC_7984_SIMD  => fw_impl::llt::iop_erc_7984_simd;
 ]);

 #[instrument(level = "trace", skip(prog))]
@@ -1296,13 +1296,13 @@ pub fn iop_if_then_else(prog: &mut Program) {
        });
 }

-/// Implement erc_20 fund xfer
+/// Implement erc_7984 fund xfer
 /// Targeted algorithm is as follow:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
 /// 3. Compute new amount (from - new_amount, to + new_amount)
 #[instrument(level = "info", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
+pub fn iop_erc_7984(prog: &mut Program) {
    // Allocate metavariables:
    // Dest -> Operand
    let mut dst_from = prog.iop_template_var(OperandKind::Dst, 0);
@@ -1314,7 +1314,7 @@ pub fn iop_erc_20(prog: &mut Program) {
    let src_amount = prog.iop_template_var(OperandKind::Src, 2);

    // Add Comment header
-    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());

    let props = prog.params();
    let tfhe_params: asm::DigitParameters = props.clone().into();
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
@@ -70,7 +70,7 @@ crate::impl_fw!("Llt" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_20 => fw_impl::llt::iop_erc_20;
+    ERC_7984 => fw_impl::llt::iop_erc_7984;
    MEMCPY => fw_impl::ilp::iop_memcpy;

    COUNT0 => fw_impl::ilp_log::iop_count0;
@@ -83,7 +83,7 @@ crate::impl_fw!("Llt" [

    // SIMD Implementations
    ADD_SIMD     => fw_impl::llt::iop_add_simd;
-    ERC_20_SIMD  => fw_impl::llt::iop_erc_20_simd;
+    ERC_7984_SIMD  => fw_impl::llt::iop_erc_7984_simd;
 ]);

 // ----------------------------------------------------------------------------
@@ -225,24 +225,24 @@ pub fn iop_muls(prog: &mut Program) {
 }

 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
+pub fn iop_erc_7984(prog: &mut Program) {
    // Add Comment header
-    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
    // TODO: Make sweep of kogge_blk_w
    // All these little parameters would be very handy to write an
    // exploration/compilation program which would try to minimize latency by
    // playing with these.
-    iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
+    iop_erc_7984_rtl(prog, 0, Some(10)).add_to_prog(prog);
 }

 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_simd(prog: &mut Program) {
+pub fn iop_erc_7984_simd(prog: &mut Program) {
    // Add Comment header
-    prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_7984_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
    simd(
        prog,
        crate::asm::iop::SIMD_N,
-        fw_impl::llt::iop_erc_20_rtl,
+        fw_impl::llt::iop_erc_7984_rtl,
        None,
    );
 }
@@ -379,7 +379,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
 // Helper Functions
 // ----------------------------------------------------------------------------

-/// Implement erc_20 fund xfer
+/// Implement erc_7984 fund xfer
 /// Targeted algorithm is as follow:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
@@ -391,7 +391,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
 ///     (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
 /// Where N is the batch size
 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
+pub fn iop_erc_7984_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
    // Allocate metavariables:
    // Dest -> Operand
    let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
--- a/ci/data_extractor/src/benchmark_specs.py
+++ b/ci/data_extractor/src/benchmark_specs.py
@@ -302,7 +302,7 @@ class BenchType(enum.StrEnum):

 class BenchSubset(enum.StrEnum):
    All = "all"
-    Erc20 = "erc20"
+    Erc7984 = "erc7984"
    Zk = "zk"

    @staticmethod
@@ -310,8 +310,8 @@ class BenchSubset(enum.StrEnum):
        match bench_subset.lower():
            case "all":
                return BenchSubset.All
-            case "erc20":
-                return BenchSubset.Erc20
+            case "erc7984":
+                return BenchSubset.Erc7984
            case "zk":
                return BenchSubset.Zk
            case _:
@@ -611,14 +611,14 @@ class BenchDetails:
                        # Case for arithmetic operations (add, sub, mul,...)
                        self.operation_name = "::".join(parts[2:-2])
                    else:
-                        # Case for higher-level operation (erc20 transfer, dex,...)
+                        # Case for higher-level operation (erc7984 transfer, dex,...)
                        self.operation_name = "::".join(parts[2:-1])
                else:
                    if "_PARAM_" in parts[-2]:
                        # Case for arithmetic operations (add, sub, mul,...)
                        self.operation_name = "::".join(parts[1:-2])
                    else:
-                        # Case for higher-level operation (erc20 transfer, dex,...)
+                        # Case for higher-level operation (erc7984 transfer, dex,...)
                        self.operation_name = "::".join(parts[1:-1])
                self.rust_type = parts[-1].partition("_mean")[0]
            case Layer.Shortint:
--- a/ci/data_extractor/src/data_extractor.py
+++ b/ci/data_extractor/src/data_extractor.py
@@ -137,7 +137,7 @@ parser.add_argument(
 parser.add_argument(
    "--bench-subset",
    dest="bench_subset",
-    choices=["all", "erc20", "zk"],
+    choices=["all", "erc7984", "zk"],
    default="all",
    help="Subset of benchmarks to filter against, dedicated formatting will be applied",
 )
@@ -285,8 +285,8 @@ def perform_hardware_comparison(

 def get_formatter(layer: Layer, bench_subset: BenchSubset):
    match bench_subset:
-        case BenchSubset.Erc20:
-            return formatters.hlapi.Erc20Formatter
+        case BenchSubset.Erc7984:
+            return formatters.hlapi.Erc7984Formatter
        case BenchSubset.Zk:
            if layer == Layer.Wasm:
                return formatters.wasm.ZKFormatter
@@ -442,7 +442,7 @@ def get_operands_types(layer: Layer, bench_subset: BenchSubset = None):
        return ciphertext_only
    elif bench_subset:
        match bench_subset:
-            case BenchSubset.Zk | BenchSubset.Erc20:
+            case BenchSubset.Zk | BenchSubset.Erc7984:
                return ciphertext_only
            case BenchSubset.All:
                return ciphertext_and_plaintext
--- a/ci/data_extractor/src/formatters/hlapi/hlapi.py
+++ b/ci/data_extractor/src/formatters/hlapi/hlapi.py
@@ -39,9 +39,9 @@ class HlApiFormatter(GenericFormatter):
 TRANSFER_IMPLEM_COLUMN_HEADER = "Transfer implementation"


-class Erc20Formatter(HlApiFormatter):
+class Erc7984Formatter(HlApiFormatter):
    """
-    Formatter for ERC20 benchmarks.
+    Formatter for ERC7984 benchmarks.
    """

    @staticmethod
@@ -63,7 +63,7 @@ class Erc20Formatter(HlApiFormatter):
                bench_type = BenchType.Latency
                conversion_func = utils.convert_latency_value_to_readable_text

-            # For now ERC20 benchmarks are only made on 64-bit ciphertexts.
+            # For now ERC7984 benchmarks are only made on 64-bit ciphertexts.
            value = conversion_func(timings[-1])
            formatted[test_name][bench_type] = value

--- a/ci/regression.toml
+++ b/ci/regression.toml
@@ -66,7 +66,7 @@ target.hlapi-dex = [
    "swap_claim::whitepaper",
    "swap_claim::no_cmux"
 ]
-target.hlapi-erc20 = ["transfer::whitepaper", "transfer::no_cmux"]
+target.hlapi-erc7984 = ["transfer::whitepaper", "transfer::no_cmux"]
 target.core_crypto-ks = ["keyswitch"]
 target.core_crypto-pbs = ["multi_bit_pbs"]
 parameters_filter = "PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128"
@@ -99,7 +99,7 @@ target.hlapi-dex = [
    "swap_claim::whitepaper",
    "swap_claim::no_cmux"
 ]
-target.hlapi-erc20 = ["transfer::whitepaper", "transfer::no_cmux"]
+target.hlapi-erc7984 = ["transfer::whitepaper", "transfer::no_cmux"]
 target.shortint = ["bitand"]
 target.core_crypto-ks = ["keyswitch"]
 target.core_crypto-pbs = [ "pbs_mem_optimized"]
--- a/mockups/tfhe-hpu-mockup/LICENSE
+++ b/mockups/tfhe-hpu-mockup/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/mockups/tfhe-hpu-mockup/README.md
+++ b/mockups/tfhe-hpu-mockup/README.md
@@ -164,7 +164,7 @@ cargo run --release --features="hpu" --test hpu -- u8
 # * bitwise: for ct x ct bitwise IOps
 # * cmp: for comparison IOps
 # * ternary: for if-then-else and like IOps
-# * algo: for IOps dedicated to offload sub-algorithm like ERC_20
+# * algo: for IOps dedicated to offload sub-algorithm like ERC_7984
 # Command below only run comparison IOps, for convenience, `just hpu_test "cmp"` could be also used
 cargo run --release --features="hpu" --test hpu -- cmp
 ```
--- a/scripts/check_scratch_cleanup.py
+++ b/scripts/check_scratch_cleanup.py
@@ -52,30 +52,31 @@ RUST_CALL_SITES = [
 # ---------------------------------------------------------------------------

 # Bindings parsed from bindings.rs
-# Scratch functions (equal to cleanup count)
-EXPECTED_SCRATCH_COUNT = 70
+# Scratch functions (equal to cleanup count, see
+#  EXPECTED_CLEANUP_COUNT below)
+EXPECTED_SCRATCH_COUNT = 73

 # Cuda operation functions
-EXPECTED_CUDA_COUNT = 109
+EXPECTED_CUDA_COUNT = 114

 # Cleanup functions
-EXPECTED_CLEANUP_COUNT = 70
+EXPECTED_CLEANUP_COUNT = 73

 # Check 3: Rust call-site scanning
 # Number of functions in ffi.rs files
-EXPECTED_CHECK3_RUST_FNS = 134
-# Number of async cuda calls found in ffi.rs functions
-EXPECTED_CHECK3_ASYNC_CUDA_CALLS = 90
+EXPECTED_CHECK3_RUST_FNS = 139
+# Number of async cuda calls found in ffi.rs functions
+EXPECTED_CHECK3_ASYNC_CUDA_CALLS = 95

 # Number of instances of Rust calls to the scratch/cuda/cleanup in a
 # triplet sequence.
-EXPECTED_CHECK3_SCRATCH_CUDA_CLEANUP_TRIPLET_CALLS = 111
+EXPECTED_CHECK3_SCRATCH_CUDA_CLEANUP_TRIPLET_CALLS = 114

 # Check 5: Rust async-caller scanning
-EXPECTED_CHECK5_ASYNC_CALLERS = 118
+EXPECTED_CHECK5_ASYNC_CALLERS = 123

 # Check 6: Rust cleanup-caller scanning
-EXPECTED_CHECK6_CLEANUP_CALLERS = 108
+EXPECTED_CHECK6_CLEANUP_CALLERS = 111


 def check_paths_exist():
--- a/tests/backward_compatibility/high_level_api.rs
+++ b/tests/backward_compatibility/high_level_api.rs
@@ -13,7 +13,9 @@ use tfhe::shortint::prelude::LweDimension;
 use tfhe::shortint::{CarryModulus, CiphertextModulus, MessageModulus};
 use tfhe::xof_key_set::CompressedXofKeySet;
 #[cfg(feature = "zk-pok")]
-use tfhe::zk::{CompactPkeCrs, CompactPkeCrsConformanceParams};
+use tfhe::zk::new_compact_pke_crs_conformance_params;
+#[cfg(feature = "zk-pok")]
+use tfhe::zk::CompactPkeCrs;
 #[cfg(feature = "zk-pok")]
 use tfhe::ProvenCompactCiphertextList;
 use tfhe::{
@@ -175,7 +177,8 @@ pub fn test_zk_params(
            zk_scheme: loaded_crs.scheme_version().into(),
        };
        let conformance_params =
-            CompactPkeCrsConformanceParams::new(pke_params, loaded_crs.max_num_messages()).unwrap();
+            new_compact_pke_crs_conformance_params(pke_params, loaded_crs.max_num_messages())
+                .unwrap();

        loaded_crs.is_conformant(&conformance_params);
    }
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -92,8 +92,8 @@ harness = false
 required-features = ["integer", "internal-keycache", "pbs-stats"]

 [[bench]]
-name = "hlapi-erc20"
-path = "benches/high_level_api/erc20.rs"
+name = "hlapi-erc7984"
+path = "benches/high_level_api/erc7984.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]

--- a/tfhe-benchmark/LICENSE
+++ b/tfhe-benchmark/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/tfhe-benchmark/benches/high_level_api/dex.rs
+++ b/tfhe-benchmark/benches/high_level_api/dex.rs
@@ -253,7 +253,7 @@ mod pbs_stats {
        let _ = swap_request_update_dex_balance_func(&from_balance, &current_dex_balance, &amount);
        let count = tfhe::get_pbs_count() * 2;

-        println!("ERC20 swap request update dex balance/::{type_name}: {count} PBS");
+        println!("ERC7984 swap request update dex balance/::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();
        let params_name = params.name();
@@ -308,7 +308,7 @@ mod pbs_stats {
        let (_, _) = swap_request_finalize_func(&to_balance_0, &total_dex_token_0_in, &sent_0);
        let count = tfhe::get_pbs_count() * 2;

-        println!("ERC20 swap request finalize/::{type_name}: {count} PBS");
+        println!("ERC7984 swap request finalize/::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();
        let params_name = params.name();
@@ -373,7 +373,7 @@ mod pbs_stats {
        );
        let count = tfhe::get_pbs_count();

-        println!("ERC20 swap claim prepare/::{type_name}: {count} PBS");
+        println!("ERC7984 swap claim prepare/::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();
        let params_name = params.name();
@@ -433,7 +433,7 @@ mod pbs_stats {
        );
        let count = tfhe::get_pbs_count() * 2;

-        println!("ERC20 swap claim update dex balance/::{type_name}: {count} PBS");
+        println!("ERC7984 swap claim update dex balance/::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();
        let params_name = params.name();
--- a/tfhe-benchmark/benches/high_level_api/erc7984.rs
+++ b/tfhe-benchmark/benches/high_level_api/erc7984.rs
@@ -252,8 +252,8 @@ where
        boolean: vec![],
        imm: vec![],
    };
-    let mut res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_20, src);
-    // Iop erc_20 return new_from, new_to
+    let mut res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_7984, src);
+    // Iop erc_7984 return new_from, new_to
    let new_to = res_handle.native.pop().unwrap();
    let new_from = res_handle.native.pop().unwrap();
    (new_from, new_to)
@@ -278,8 +278,8 @@ where
        boolean: vec![],
        imm: vec![],
    };
-    let res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_20_SIMD, src);
-    // Iop erc_20 return new_from, new_to
+    let res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_7984_SIMD, src);
+    // Iop erc_7984 return new_from, new_to
    let res = res_handle.native;
    res
 }
@@ -319,18 +319,18 @@ mod pbs_stats {
        let (_, _) = transfer_func(&from_amount, &to_amount, &amount);
        let count = tfhe::get_pbs_count();

-        println!("ERC20 transfer/{fn_name}::{type_name}: {count} PBS");
+        println!("ERC7984 transfer/{fn_name}::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();
        let params_name = params.name();

        let test_name = if cfg!(feature = "gpu") {
-            format!("hlapi::cuda::erc20::pbs_count::{fn_name}::{params_name}::{type_name}")
+            format!("hlapi::cuda::erc7984::pbs_count::{fn_name}::{params_name}::{type_name}")
        } else {
-            format!("hlapi::erc20::pbs_count::{fn_name}::{params_name}::{type_name}")
+            format!("hlapi::erc7984::pbs_count::{fn_name}::{params_name}::{type_name}")
        };

-        let results_file = Path::new("erc20_pbs_count.csv");
+        let results_file = Path::new("erc7984_pbs_count.csv");
        if !results_file.exists() {
            File::create(results_file).expect("create results file failed");
        }
@@ -392,7 +392,7 @@ fn bench_transfer_latency<FheType, F>(
        &bench_id,
        params,
        params_name,
-        "erc20-transfer",
+        "erc7984-transfer",
        &OperatorType::Atomic,
        64,
        vec![],
@@ -413,7 +413,7 @@ fn bench_transfer_latency_simd<FheType, F>(
    F: for<'a> Fn(&'a Vec<FheType>, &'a Vec<FheType>, &'a Vec<FheType>) -> Vec<FheType>,
 {
    use tfhe::tfhe_hpu_backend::prelude::hpu_asm;
-    let hpu_simd_n = hpu_asm::iop::IOP_ERC_20_SIMD
+    let hpu_simd_n = hpu_asm::iop::IOP_ERC_7984_SIMD
        .format()
        .unwrap()
        .proto
@@ -453,7 +453,7 @@ fn bench_transfer_latency_simd<FheType, F>(
        &bench_id,
        params,
        params_name,
-        "erc20-simd-transfer",
+        "erc7984-simd-transfer",
        &OperatorType::Atomic,
        64,
        vec![],
@@ -507,7 +507,7 @@ fn bench_transfer_throughput<FheType, F>(
            &bench_id,
            params,
            &params_name,
-            "erc20-transfer",
+            "erc7984-transfer",
            &OperatorType::Atomic,
            64,
            vec![],
@@ -597,7 +597,7 @@ fn cuda_bench_transfer_throughput<FheType, F>(
        &bench_id,
        params,
        &params_name,
-        "erc20-transfer",
+        "erc7984-transfer",
        &OperatorType::Atomic,
        64,
        vec![],
@@ -661,7 +661,7 @@ fn hpu_bench_transfer_throughput<FheType, F>(
            &bench_id,
            params,
            &params_name,
-            "erc20-transfer",
+            "erc7984-transfer",
            &OperatorType::Atomic,
            64,
            vec![],
@@ -683,7 +683,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
    F: for<'a> Fn(&'a Vec<FheType>, &'a Vec<FheType>, &'a Vec<FheType>) -> Vec<FheType> + Sync,
 {
    use tfhe::tfhe_hpu_backend::prelude::hpu_asm;
-    let hpu_simd_n = hpu_asm::iop::IOP_ERC_20_SIMD
+    let hpu_simd_n = hpu_asm::iop::IOP_ERC_7984_SIMD
        .format()
        .unwrap()
        .proto
@@ -746,7 +746,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
            &bench_id,
            params,
            &params_name,
-            "erc20-simd-ransfer",
+            "erc7984-simd-transfer",
            &OperatorType::Atomic,
            64,
            vec![],
@@ -769,7 +769,7 @@ fn main() {

    let mut c = Criterion::default().sample_size(10).configure_from_args();

-    let bench_name = "hlapi::erc20";
+    let bench_name = "hlapi::erc7984";

    // FheUint64 PBS counts
    // We don't run multiple times since every input is encrypted
@@ -896,7 +896,7 @@ fn main() {

    let mut c = Criterion::default().sample_size(10).configure_from_args();

-    let bench_name = "hlapi::cuda::erc20";
+    let bench_name = "hlapi::cuda::erc7984";

    // FheUint64 PBS counts
    // We don't run multiple times since every input is encrypted
@@ -1027,7 +1027,7 @@ fn main() {

    let mut c = Criterion::default().sample_size(10).configure_from_args();

-    let bench_name = "hlapi::hpu::erc20";
+    let bench_name = "hlapi::hpu::erc7984";

    match get_bench_type() {
        BenchmarkType::Latency => {
@@ -1040,7 +1040,7 @@ fn main() {
                "transfer::whitepaper",
                transfer_whitepaper::<FheUint64>,
            );
-            // Erc20 optimized instruction only available on Hpu
+            // Erc7984 optimized instruction only available on Hpu
            bench_transfer_latency(
                &mut group,
                &cks,
@@ -1049,7 +1049,7 @@ fn main() {
                "transfer::hpu_optim",
                transfer_hpu::<FheUint64>,
            );
-            // Erc20 SIMD instruction only available on Hpu
+            // Erc7984 SIMD instruction only available on Hpu
            bench_transfer_latency_simd(
                &mut group,
                &cks,
@@ -1071,7 +1071,7 @@ fn main() {
                "transfer::whitepaper",
                transfer_whitepaper::<FheUint64>,
            );
-            // Erc20 optimized instruction only available on Hpu
+            // Erc7984 optimized instruction only available on Hpu
            hpu_bench_transfer_throughput(
                &mut group,
                &cks,
@@ -1080,7 +1080,7 @@ fn main() {
                "transfer::hpu_optim",
                transfer_hpu::<FheUint64>,
            );
-            // Erc20 SIMD instruction only available on Hpu
+            // Erc7984 SIMD instruction only available on Hpu
            hpu_bench_transfer_throughput_simd(
                &mut group,
                &cks,
--- a/tfhe-benchmark/benches/integer/bench.rs
+++ b/tfhe-benchmark/benches/integer/bench.rs
@@ -4,6 +4,7 @@ mod aes;
 mod aes256;
 mod kreyvium;
 mod oprf;
+mod shuffle;
 mod trivium;
 mod vector_find;

@@ -2790,6 +2791,7 @@ mod cuda {
        cuda_unchecked_rotate_left,
        cuda_unchecked_rotate_right,
        cuda_unchecked_ilog2,
+        shuffle::cuda::cuda_unchecked_bitonic_sort,
    );

    criterion_group!(
--- a/tfhe-benchmark/benches/integer/shuffle.rs
+++ b/tfhe-benchmark/benches/integer/shuffle.rs
@@ -0,0 +1,87 @@
+/// CUDA benchmarks for the unchecked bitonic sort on unsigned radix ciphertexts.
+#[cfg(feature = "gpu")]
+pub mod cuda {
+    use benchmark::params_aliases::{
+        BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+        BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS,
+    };
+    use benchmark::utilities::{write_to_json_unchecked, OperatorType};
+    use criterion::Criterion;
+    use rand::prelude::*;
+    use tfhe::core_crypto::gpu::CudaStreams;
+    use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
+    use tfhe::integer::gpu::CudaServerKey;
+    use tfhe::integer::keycache::KEY_CACHE;
+    use tfhe::integer::{IntegerKeyKind, RadixClientKey};
+    use tfhe::keycache::NamedParam;
+    use tfhe::shortint::AtomicPatternParameters;
+
+    /// Returns the `(num_elements, bit_size)` pairs that get benchmarked.
+    fn bitonic_sort_scenarios() -> Vec<(usize, usize)> {
+        const BIT_SIZE: usize = 32;
+        [8, 16, 32, 64]
+            .iter()
+            .map(|&count| (count, BIT_SIZE))
+            .collect()
+    }
+
+    /// Benchmarks `CudaServerKey::unchecked_bitonic_sort` for one parameter set.
+    ///
+    /// Every scenario from `bitonic_sort_scenarios` is registered in a single criterion group
+    /// and then recorded through `write_to_json_unchecked`.
+    fn bench_cuda_unchecked_bitonic_sort_for_params<P>(c: &mut Criterion, param: P)
+    where
+        P: Copy + NamedParam + Into<AtomicPatternParameters>,
+    {
+        let bench_name = "integer::cuda::unsigned::unchecked_bitonic_sort";
+
+        let atomic_param: AtomicPatternParameters = param.into();
+        let param_name = param.name();
+        // Number of message bits stored in one radix block.
+        let bits_per_block = atomic_param.message_modulus().0.ilog2() as usize;
+
+        // The client key comes from the key cache; the GPU server key is derived from it.
+        let streams = CudaStreams::new_multi_gpu();
+        let (cpu_cks, _) = KEY_CACHE.get_from_params(atomic_param, IntegerKeyKind::Radix);
+        let sks = CudaServerKey::new(&cpu_cks, &streams);
+
+        let mut rng = rand::thread_rng();
+
+        let mut group = c.benchmark_group(bench_name);
+        group.sample_size(10);
+        group.measurement_time(std::time::Duration::from_secs(60));
+
+        for (num_elements, bit_size) in bitonic_sort_scenarios() {
+            // Enough blocks to hold `bit_size` bits of message.
+            let num_blocks = bit_size.div_ceil(bits_per_block);
+            let cks = RadixClientKey::from((cpu_cks.clone(), num_blocks));
+
+            let bench_id =
+                format!("{bench_name}::{param_name}::{bit_size}_bits::{num_elements}_elements");
+
+            group.bench_function(&bench_id, |bencher| {
+                // Setup closure: encrypt fresh random clears and move them to the GPU.
+                let make_inputs = || {
+                    let mut inputs = Vec::with_capacity(num_elements);
+                    for _ in 0..num_elements {
+                        let clear: u64 = rng.gen();
+                        let ct = cks.encrypt(clear);
+                        inputs.push(CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                            &ct, &streams,
+                        ));
+                    }
+                    inputs
+                };
+
+                bencher.iter_batched(
+                    make_inputs,
+                    |mut values| {
+                        sks.unchecked_bitonic_sort(&mut values, &streams);
+                    },
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+
+            write_to_json_unchecked::<u64, _>(
+                &bench_id,
+                atomic_param,
+                param_name.as_str(),
+                "unchecked_bitonic_sort",
+                &OperatorType::Atomic,
+                bit_size as u32,
+                vec![bits_per_block as u32; num_blocks],
+            );
+        }
+
+        group.finish();
+    }
+
+    /// Criterion entry point: runs the bitonic sort benchmark once per parameter set, first the
+    /// GPU multi-bit parameters, then the classic `MESSAGE_2_CARRY_2` ones.
+    pub fn cuda_unchecked_bitonic_sort(c: &mut Criterion) {
+        let multi_bit_params =
+            BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        bench_cuda_unchecked_bitonic_sort_for_params(c, multi_bit_params);
+
+        let classic_params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS;
+        bench_cuda_unchecked_bitonic_sort_for_params(c, classic_params);
+    }
+}
--- a/tfhe-benchmark/src/bin/wasm_benchmarks_parser.rs
+++ b/tfhe-benchmark/src/bin/wasm_benchmarks_parser.rs
@@ -59,7 +59,7 @@ pub fn parse_wasm_benchmarks(results_file: &Path, raw_results_file: &Path) {
        let bench_name = name_parts[0];
        let params: PBSParameters = params_from_name(name_parts[1]).into();
        println!("{name_parts:?}");
-        if bench_name.contains("_size") {
+        if full_name.contains("_size") {
            write_result(&mut file, &prefixed_full_name, *val as usize);
        } else {
            let value_in_ns = (val * 1_000_000_f32) as usize;
--- a/tfhe-csprng/LICENSE
+++ b/tfhe-csprng/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/tfhe-fft/LICENSE
+++ b/tfhe-fft/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/tfhe-ntt/LICENSE
+++ b/tfhe-ntt/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/tfhe-zk-pok/Cargo.toml
+++ b/tfhe-zk-pok/Cargo.toml
@@ -24,6 +24,7 @@ serde = { workspace = true, features = ["default", "derive"] }
 zeroize = "1.7.0"
 num-bigint = "0.4.5"
 tfhe-versionable = { version = "0.7.0", path = "../utils/tfhe-versionable" }
+tfhe-safe-serialize = { version = "0.1.0", path = "../utils/tfhe-safe-serialize" }
 zk-cuda-backend = { version = "0.1.0", path = "../backends/zk-cuda-backend", optional = true }
 tfhe-cuda-backend = { version = "=0.14.0", path = "../backends/tfhe-cuda-backend", optional = true }
 itertools.workspace = true
--- a/tfhe-zk-pok/LICENSE
+++ b/tfhe-zk-pok/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/tfhe-zk-pok/src/proofs/mod.rs
+++ b/tfhe-zk-pok/src/proofs/mod.rs
@@ -35,6 +35,44 @@ impl Display for ComputeLoad {
    }
 }

+impl From<ComputeLoad> for usize {
+    /// Converts a [`ComputeLoad`] into an index: `Proof` maps to 0, `Verify` maps to 1.
+    fn from(load: ComputeLoad) -> Self {
+        match load {
+            ComputeLoad::Verify => 1,
+            ComputeLoad::Proof => 0,
+        }
+    }
+}
+
+/// Expected values a compact PKE CRS is checked against for conformance.
+///
+/// The `ParameterSetConformant` implementation of the pke `PublicParams` compares each field
+/// for equality with the matching field of the CRS.
+///
+/// All fields are plain integers, so the type derives `Clone`, `Copy` and `Debug`.
+#[derive(Clone, Copy, Debug)]
+pub struct CompactPkeCrsConformanceParams {
+    /// Expected LWE dimension (compared with the CRS field `d`).
+    lwe_dim: usize,
+    /// Expected maximum number of messages (compared with the CRS field `k`).
+    max_num_message: usize,
+    /// Expected noise bound (compared with the CRS field `b`).
+    noise_bound: u64,
+    /// Expected ciphertext modulus (compared with the CRS field `q`).
+    ciphertext_modulus: u64,
+    /// Expected plaintext modulus (compared with the CRS field `t`).
+    plaintext_modulus: u64,
+    /// Expected value of the CRS field `msbs_zero_padding_bit_count`.
+    msbs_zero_padding_bit_count: u64,
+}
+
+impl CompactPkeCrsConformanceParams {
+    /// Builds the conformance parameters from the expected CRS values.
+    ///
+    /// The arguments are stored as-is; no validation is performed here.
+    pub fn new(
+        lwe_dim: usize,
+        max_num_message: usize,
+        noise_bound: u64,
+        ciphertext_modulus: u64,
+        plaintext_modulus: u64,
+        msbs_zero_padding_bit_count: u64,
+    ) -> Self {
+        Self {
+            lwe_dim,
+            max_num_message,
+            noise_bound,
+            ciphertext_modulus,
+            plaintext_modulus,
+            msbs_zero_padding_bit_count,
+        }
+    }
+}
+
 impl<T: ?Sized> OneBased<T> {
    pub fn new(inner: T) -> Self
    where
--- a/tfhe-zk-pok/src/proofs/pke/mod.rs
+++ b/tfhe-zk-pok/src/proofs/pke/mod.rs
@@ -14,6 +14,8 @@ use core::marker::PhantomData;

 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
+use tfhe_safe_serialize::{EnumSet, Named, ParameterSetConformant};
+use tfhe_versionable::Versionize;

 pub(crate) fn bit_iter(x: u64, nbits: u32) -> impl Iterator<Item = bool> {
    (0..nbits).map(move |idx| ((x >> idx) & 1) != 0)
@@ -44,6 +46,25 @@ pub struct PublicParams<G: Curve> {
    pub(crate) domain_separators: PKEv1DomainSeparators,
 }

+impl<G: Curve> Named for PublicParams<G> {
+    // Type name exposed through the `Named` trait of `tfhe_safe_serialize`.
+    // NOTE(review): presumably checked during safe (de)serialization, so changing this string
+    // would likely break compatibility with existing data — confirm before editing.
+    const NAME: &'static str = "zk::CompactPkePublicParams";
+}
+
+impl<G: Curve> ParameterSetConformant for PublicParams<G> {
+    type ParameterSet = CompactPkeCrsConformanceParams;
+
+    /// Returns `true` when this CRS matches `parameter_set`.
+    ///
+    /// The CRS must satisfy `k <= d`, every scalar parameter must equal the expected one, and
+    /// `is_usable()` must hold. `is_usable()` is evaluated last, only if all other checks pass.
+    fn is_conformant(&self, parameter_set: &Self::ParameterSet) -> bool {
+        let CompactPkeCrsConformanceParams {
+            lwe_dim,
+            max_num_message,
+            noise_bound,
+            ciphertext_modulus,
+            plaintext_modulus,
+            msbs_zero_padding_bit_count,
+        } = parameter_set;
+
+        // Internal consistency of the CRS itself.
+        if self.k > self.d {
+            return false;
+        }
+
+        // Field-by-field comparison with the expected parameters.
+        let matches_expected = self.d == *lwe_dim
+            && self.k == *max_num_message
+            && self.b == *noise_bound
+            && self.q == *ciphertext_modulus
+            && self.t == *plaintext_modulus
+            && self.msbs_zero_padding_bit_count == *msbs_zero_padding_bit_count;
+
+        matches_expected && self.is_usable()
+    }
+}
+
 #[derive(Clone, Debug)]
 pub(crate) enum PKEv1DomainSeparators {
    Legacy(Box<LegacyPKEv1DomainSeparators>),
@@ -323,6 +344,52 @@ impl<G: Curve> Proof<G> {
    }
 }

+impl<G: Curve> ParameterSetConformant for Proof<G> {
+    type ParameterSet = CompactPkeV1ProofConformanceParams;
+
+    /// Returns `true` when the proof's compute load is in the accepted set of `parameter_set`
+    /// and `is_usable()` holds; `is_usable()` is only evaluated if the compute load is accepted.
+    fn is_conformant(&self, parameter_set: &Self::ParameterSet) -> bool {
+        let accepted = &parameter_set.accepted_compute_load;
+        let load_is_accepted = accepted.contains(self.compute_load());
+
+        load_is_accepted && self.is_usable()
+    }
+}
+
+/// Used to explicitly reject [`Proof`] v1 proofs that come with specific config
+#[derive(Copy, Clone)]
+pub struct CompactPkeV1ProofConformanceParams {
+    /// Compute loads a proof may carry and still be considered conformant.
+    accepted_compute_load: EnumSet<ComputeLoad>,
+}
+
+impl Default for CompactPkeV1ProofConformanceParams {
+    /// Same as [`CompactPkeV1ProofConformanceParams::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl CompactPkeV1ProofConformanceParams {
+    /// Create new params that accept all proof configurations
+    pub fn new() -> Self {
+        let mut accepted_compute_load = EnumSet::new();
+        // Start from the full set: both compute loads are allowed.
+        for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
+            accepted_compute_load.insert(load);
+        }
+
+        Self {
+            accepted_compute_load,
+        }
+    }
+
+    /// Forbid proofs coming with the provided [`ComputeLoad`]
+    ///
+    /// Consumes the params and returns them with that load removed from the accepted set.
+    pub fn forbid_compute_load(mut self, forbidden_compute_load: ComputeLoad) -> Self {
+        self.accepted_compute_load.remove(forbidden_compute_load);
+        self
+    }
+}
+
 /// These fields can be pre-computed on the prover side in the faster Verifier scheme. If that's the
 /// case, they should be included in the proof.
 #[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize, Versionize)]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Enzo Di Maria	f6b8be25a7	feat(gpu): implement shuffle	2026-04-15 15:31:13 +02:00
Andrei Stoian	32cf1969bf	fix(gpu): semgrep step in pcc now fails on error	2026-04-15 14:20:34 +02:00
Andrei Stoian	600a30131e	chore(gpu): optimize CI	2026-04-15 12:48:31 +02:00
David Palm	96d230cf6f	chore: make CompressedXofKeySet::decompress take a reference	2026-04-14 16:24:33 +02:00
Nicolas Sarlin	4790f8ba1c	fix(bench): wrong size in wasm benchmarks	2026-04-14 11:17:11 +02:00
dependabot[bot]	79a54df25b	chore(deps): bump docker/login-action from 4.0.0 to 4.1.0 Bumps [docker/login-action](https://github.com/docker/login-action) from 4.0.0 to 4.1.0. - [Release notes](https://github.com/docker/login-action/releases) - [Commits](`b45d80f862...4907a6ddec`) --- updated-dependencies: - dependency-name: docker/login-action dependency-version: 4.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-14 10:32:05 +02:00
Theo Souchon	50d6be121a	chore(test): refacto around noise check test and json output	2026-04-14 09:58:20 +02:00
Nicolas Sarlin	7cd966d8a7	chore: allow rand audit advisory	2026-04-14 08:54:05 +02:00
David Testé	6ca929051d	chore(ci): remove permanent instance fallback for gpu These fallback were set to mitigate Hyperstack resource shortages. Those instances are not used anymore and workflows are modified to avoid having a workflow run stuck because it waits for a permanent runner that doesn't exist.	2026-04-10 14:30:32 +02:00
Pedro Alves	871cc8f772	chore(docs): rewrite GPU ZK-PoK documentation for zk-cuda-backend integration	2026-04-10 08:40:08 -03:00
Theo Souchon	b938473788	chore: renamed erc20 to erc7984	2026-04-10 09:18:51 +02:00
Nicolas Sarlin	74869f5e2f	chore(integer): refactor expansion helper - Split the pure expand and the post processing (cast, unpack, sanitize) - Add a new internal intermediate type: ExpandedCiphertextList - verify_and_expand just calls verify+expand	2026-04-09 11:07:03 +02:00
dependabot[bot]	326dd6a5c7	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 5.0.1 to 5.0.4. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`70c4af2ed5...ca46236c6c`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-version: 5.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-08 09:42:31 +02:00
Guillermo Oyarzun	1abc69751a	feat(gpu): create noise and pfail tests for rerand	2026-04-07 20:33:31 +02:00
Pedro Alves	3c2cb273d5	chore(docs): add GPU ZK benchmark SVG generation to CI pipeline Add GPU ZK benchmark and SVG generation jobs to the documentation workflows, and fix the data extractor to handle the cuda::zk:: prefix chain in GPU ZK benchmark names.	2026-04-07 05:02:04 -03:00
dependabot[bot]	b18060e5c8	chore(deps): bump codecov/codecov-action from 5.5.2 to 6.0.0 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.5.2 to 6.0.0. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](`671740ac38...57e3a136b7`) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-version: 6.0.0 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-07 09:51:30 +02:00
dependabot[bot]	c8827a21a7	chore(deps): bump rust-lang/crates-io-auth-action from 1.0.3 to 1.0.4 Bumps [rust-lang/crates-io-auth-action](https://github.com/rust-lang/crates-io-auth-action) from 1.0.3 to 1.0.4. - [Release notes](https://github.com/rust-lang/crates-io-auth-action/releases) - [Commits](`b7e9a28ede...bbd81622f2`) --- updated-dependencies: - dependency-name: rust-lang/crates-io-auth-action dependency-version: 1.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-07 09:51:08 +02:00
Andrei Stoian	a7476d0aaa	chore(gpu): update benchmarks GPU fix(gpu): pbs benches fix(gpu): pbs benches	2026-04-03 11:29:51 +02:00
David Testé	10d104e500	chore: update copyright year to 2026	2026-04-03 10:22:13 +02:00
David Testé	dbb1f151c8	chore(ci): add release workflow for zk-cuda-backend	2026-04-01 11:14:31 +02:00
David Testé	9cb8ad9bff	chore(ci): create common cuda release workflow This refactorting is done to add zk-cuda-backend crate release without duplicating the logic in the new workflow.	2026-04-01 11:14:31 +02:00
David Testé	d970210ae4	chore(ci): update slab-github-runner action to v1.6.0 This action version now uses node24 as runner since node20 support is dropped on April 2026.	2026-04-01 09:47:44 +02:00
David Palm	5236c21733	chore: Move safe-serialization to own crate and wire it up with the workspace.	2026-03-31 16:30:40 +02:00
Guillermo Oyarzun	7598725c7e	feat(gpu): add pbs128 pattern to multi-bit noise test	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	f0cff6176d	feat(gpu): add cpk ks ms pattern to multi-bit noise tests	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	8bb38d4e70	feat(gpu): add packing ks multi-bit noise tests	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	35fe71cc07	feat(gpu): add br_dp_ks_ms pattern to multi-bit noise tests	2026-03-31 14:30:01 +02:00
Nicolas Sarlin	62429da859	chore(ci): publish tfhe-compat js package	2026-03-31 13:40:08 +02:00
dependabot[bot]	8a4b3c35f4	chore(deps): bump actions/cache from 5.0.3 to 5.0.4 Bumps [actions/cache](https://github.com/actions/cache) from 5.0.3 to 5.0.4. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](`cdf6c1fa76...668228422a`) --- updated-dependencies: - dependency-name: actions/cache dependency-version: 5.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-03-30 14:31:27 +02:00
Theo Souchon	641fec028f	chore(lint): add message for the backward compat report when everything is ok	2026-03-30 13:25:58 +02:00
Nicolas Sarlin	8d8379409b	chore(shortint): remove 'parallel-wasm-api' feature gating This used to be required but now rayon handles this gracefully and runs the code sequentially when threads are not available	2026-03-30 11:58:52 +02:00
Nicolas Sarlin	d547e67f66	refactor(hl): factorize hl proven ct list expand code	2026-03-30 11:58:40 +02:00
Arthur Meyre	4cf03c063d	chore: update Cargo.locks for generate crates	2026-03-30 11:02:17 +02:00