chores(docs): minor corrections to the doc

fix(bench): wrong size in wasm benchmarks
chore(deps): bump docker/login-action from 4.0.0 to 4.1.0
2026-04-28 03:01:21 -04:00 · 2026-04-23 10:46:35 +02:00 · 2026-04-14 11:17:11 +02:00 · 2026-04-14 10:32:05 +02:00 · 2026-04-14 09:58:20 +02:00 · 2026-04-14 08:54:05 +02:00
251 changed files with 8907 additions and 4293 deletions
--- a/.cargo/audit.toml
+++ b/.cargo/audit.toml
@@ -4,6 +4,9 @@ ignore = [
    "RUSTSEC-2024-0436",
    # Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
    "RUSTSEC-2025-0141",
+    # Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
+    # arkworks and we do not use custom loggers.
+    "RUSTSEC-2026-0097",
 ]

 [output]
--- a/.github/workflows/aws_data_tests.yml
+++ b/.github/workflows/aws_data_tests.yml
@@ -54,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -16,7 +16,6 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
@@ -37,6 +36,7 @@ jobs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -79,6 +79,7 @@ jobs:
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+              - utils/tfhe-safe-serialize/**
            csprng:
              - tfhe-csprng/**
            zk_pok:
@@ -86,6 +87,8 @@ jobs:
            versionable:
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+            safe_serialize:
+              - utils/tfhe-safe-serialize/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -122,6 +125,7 @@ jobs:
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -145,7 +149,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -170,6 +174,11 @@ jobs:
        run: |
          make test_versionable

+      - name: Run tfhe-safe-serialize tests
+        if: needs.should-run.outputs.safe_serialize_test == 'true'
+        run: |
+          make test_safe_serialize
+
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -191,7 +200,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -204,7 +213,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -34,7 +34,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -14,12 +14,11 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]

 permissions:
  contents: read
@@ -32,16 +31,16 @@ jobs:
    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read  # Needed to check for file change
+      pull-requests: read # Needed to check for file change
    outputs:
      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
-          steps.changed-files.outputs.wasm_any_changed }}
+        steps.changed-files.outputs.wasm_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -63,6 +62,7 @@ jobs:
                - tfhe/js_on_wasm_tests/**
                - tfhe/web_wasm_parallel_tests/**
                - utils/tfhe-versionable/**
+                - utils/tfhe-safe-serialize/**
                - .github/workflows/aws_tfhe_wasm_tests.yml

  wasm-tests:
@@ -78,7 +78,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -92,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -105,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -128,15 +128,21 @@ jobs:
        run: |
          make test_nodejs_wasm_api_ci

-      - name: Run parallel wasm tests
-        run: |
-          make test_web_js_api_parallel_chrome_ci
-
      - name: Run wasm_par_mq tests
        run: |
          make test_wasm_par_mq_chrome_ci
          make test_wasm_par_mq_firefox_ci

+      - name: Run parallel wasm tests
+        run: |
+          make test_web_js_api_parallel_chrome_ci
+          make test_web_js_api_parallel_firefox_ci
+
+      - name: Run cross origin wasm tests
+        run: |
+          make test_web_js_api_cross_origin_chrome_ci
+          make test_web_js_api_cross_origin_firefox_ci
+
      - name: Run x86_64/wasm zk compatibility tests
        run: |
          make test_zk_wasm_x86_compat_ci
--- a/.github/workflows/backward_compat_pr_change_report.yml
+++ b/.github/workflows/backward_compat_pr_change_report.yml
@@ -6,6 +6,9 @@ name: backward_compat_pr_change_report
 on:
  pull_request:

+env:
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+
 permissions:
  contents: read

@@ -14,9 +17,35 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  should-run:
+    name: backward_compat_pr_change_report/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            backward:
+              - utils/tfhe-lints/snapshots/*.json
+
  change-report:
    name: backward_compat_pr_change_report/change-report (bpr)
    runs-on: ubuntu-latest
+    needs: should-run
+    if:
+      needs.should-run.outputs.backward_report == 'true'
    permissions:
      pull-requests: write # To send and modify message in the PR
    steps:
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -14,11 +14,12 @@ on:
          - signed_integer
          - integer_compression
          - integer_zk
+          - msm_zk
          - shortint
          - shortint_oprf
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
          - hlapi_kvstore
@@ -92,8 +93,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -107,7 +107,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set martix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -108,14 +108,14 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-hlapi-erc20:
-    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
+  run-benchmarks-hlapi-erc7984:
+    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
-      command: hlapi_erc20
-      additional_file_to_parse: erc20_pbs_count.csv
+      command: hlapi_erc7984
+      additional_file_to_parse: erc7984_pbs_count.csv
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -17,6 +17,10 @@ on:
        description: "Run GPU core-crypto benchmarks"
        type: boolean
        default: true
+      run-gpu-zk-benchmarks:
+        description: "Run GPU ZK benchmarks"
+        type: boolean
+        default: true
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
@@ -36,7 +40,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -91,7 +95,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc20
+      command: integer_multi_bit,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -110,7 +114,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -165,21 +169,42 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

+  run-benchmarks-gpu-zk-server:
+    name: benchmark_documentation/run-benchmarks-gpu-zk-server
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-zk-benchmarks
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: integer_zk
+      op_flavor: default
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
  generate-svgs-with-benchmarks-run:
    name: benchmark-documentation/generate-svgs-with-benchmarks-run
    if: ${{ always() &&
-      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
+      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
-      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
+      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
+      run-benchmarks-gpu-zk-server
    ]
    uses: ./.github/workflows/generate_svgs.yml
    with:
      time_span_days: 5
      generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
-      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
+      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
      generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -188,7 +213,7 @@ jobs:

  generate-svgs-without-benchmarks-run:
    name: benchmark-documentation/generate-svgs-without-benchmarks-run
-    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
+    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    uses: ./.github/workflows/generate_svgs.yml
    with:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,10 +31,13 @@ on:
          - pbs128
          - ks
          - ks_pbs
+          - tfhe_zk_pok
+          - msm_zk
          - integer_zk
+          - integer_zk_experimental
          - integer_aes
          - integer_aes256
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
      op_flavor:
@@ -120,8 +123,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -111,7 +111,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set martix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -126,17 +126,11 @@ jobs:
    needs: prepare-matrix
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,25 +139,6 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

-      - name: Acknowledge remote instance failure
-        if: steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile != 'single-h100'
-        run: |
-          echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
-          echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
-          exit 1
-        env:
-          INPUTS_PROFILE: ${{ inputs.profile }}
-
-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' &&
-          steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile == 'single-h100'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  # Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
  install-dependencies:
    name: benchmark_gpu_common/install-dependencies
@@ -184,7 +159,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -333,13 +307,13 @@ jobs:

  teardown-instance:
    name: benchmark_gpu_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -42,7 +42,7 @@ env:
  OPTIMIZATION_TARGET: "throughput"
  BATCH_SIZE: "5000"
  SCHEDULING_POLICY: "MAX_PARALLELISM"
-  BENCHMARKS: "erc20"
+  BENCHMARKS: "erc7984"
  BRANCH_NAME: ${{ github.ref_name }}
  COMMIT_SHA: ${{ github.sha }}
  SLAB_SECRET: ${{ secrets.JOB_SECRET }}
@@ -94,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: |
            ~/.cargo/registry
@@ -214,14 +214,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -248,13 +248,13 @@ jobs:
          npm install && npm run deploy:emptyProxies && npx hardhat compile
        working-directory: fhevm/

-      - name: Profile erc20 no-cmux benchmark on GPU
+      - name: Profile erc7984 no-cmux benchmark on GPU
        run: |
          BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
          FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
          BENCHMARK_TYPE="THROUGHPUT_200" \
          OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
-          make -e "profile_erc20_gpu"
+          make -e "profile_erc7984_gpu"
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Get nsys profile name
@@ -333,7 +333,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -14,7 +14,7 @@ on:
          - integer
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -95,7 +95,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set martix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -387,7 +387,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -63,7 +63,7 @@ jobs:
          with open(env_file, "a") as f:
            f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")

-      - name: Set martix arguments output
+      - name: Set matrix arguments output
        id: set_matrix_arg
        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
@@ -77,7 +77,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -137,7 +137,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -158,9 +158,9 @@ jobs:
        env:
          BROWSER: ${{ matrix.browser }}

-      - name: Run benchmarks (unsafe coop)
+      - name: Run benchmarks (cross origin)
        run: |
-          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
+          make bench_web_js_api_cross_origin_"${BROWSER}"_ci
        env:
          BROWSER: ${{ matrix.browser }}

@@ -218,7 +218,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -94,7 +94,7 @@ jobs:
          with open(env_file, "a") as f:
            f.write(f"""RUNNERS=["{'", "'.join(runners)}"]\n""")

-      - name: Set martix runners outputs
+      - name: Set matrix runners outputs
        id: set_matrix_runners
        run: | # zizmor: ignore[template-injection] these env variable are safe
          echo "runners=${{ toJSON(env.RUNNERS) }}" >> "${GITHUB_OUTPUT}"
@@ -138,7 +138,7 @@ jobs:
      - name: Node cache restoration
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -151,7 +151,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -50,7 +50,7 @@ jobs:
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -74,7 +74,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -88,7 +88,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -209,60 +209,98 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

+  gpu-zk-server-latency-table:
+    name: generate_documentation_svgs/gpu-zk-server-latency-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: latency
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-latency
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  gpu-zk-server-throughput-table:
+    name: generate_documentation_svgs/gpu-zk-server-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: throughput
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-throughput
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
  # -----------------------------------------------------------
-  # ERC20 benchmarks tables
+  # ERC7984 benchmarks tables
  # -----------------------------------------------------------

-  cpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
+  cpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
      backend: cpu
      hardware_name: hpc7a.96xlarge
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
+      output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  gpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
+  gpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-gpu-svgs
    with:
      backend: gpu
      hardware_name: n3-H100-SXM5x8
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: multi_bit
      grouping_factor: 4
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
+      output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  hpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
+  hpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-hpu-svgs
    with:
      backend: hpu
      hardware_name: hpu_x1
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
+      output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -73,18 +73,12 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,13 +87,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -132,7 +119,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +162,14 @@ jobs:

  teardown-instance:
    name: gpu_core_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -77,7 +77,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,17 +25,11 @@ jobs:
    name: gpu_full_h100_tests/setup-instance
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,13 +38,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: gpu_full_h100_tests/cuda-tests-linux
    needs: [ setup-instance ]
@@ -74,7 +61,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -118,13 +104,13 @@ jobs:

  teardown-instance:
    name: gpu_full_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +186,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -74,18 +74,12 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,13 +88,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -133,7 +120,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -184,14 +170,14 @@ jobs:

  teardown-instance:
    name: gpu_hlapi_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -176,7 +176,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +169,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -75,18 +75,12 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +89,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +121,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +162,14 @@ jobs:

  teardown-instance:
    name: gpu_signed_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +178,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +169,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -75,18 +75,12 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +89,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +121,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +162,14 @@ jobs:

  teardown-instance:
    name: gpu_unsigned_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +178,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -51,7 +51,16 @@ jobs:
        with:
          files_yaml: |
            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/zk/**
+              - tfhe-zk-pok/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

@@ -67,7 +76,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -126,6 +135,9 @@ jobs:
      - name: Run zk-cuda-backend integration tests
        run: |
          make test_zk_cuda_backend
+          make test_zk_pok_experimental_gpu
+          make test_integer_zk_gpu
+          make test_integer_zk_experimental_gpu

  slack-notify:
    name: gpu_zk_tests/slack-notify
@@ -158,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -107,7 +107,7 @@ jobs:
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
--- a/.github/workflows/make_release_common_cuda.yml
+++ b/.github/workflows/make_release_common_cuda.yml
@@ -1,12 +1,36 @@
-name: make_release_cuda
+# Common workflow to make crate release for CUDA backend
+name: make_release_common_cuda

 on:
-  workflow_dispatch:
+  workflow_call:
    inputs:
-      dry_run:
-        description: "Dry-run"
+      package-name:
+        type: string
+        required: true
+      dry-run:
        type: boolean
        default: true
+    secrets:
+      REPO_CHECKOUT_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true
+      ALLOWED_TEAM:
+        required: true
+      READ_ORG_TOKEN:
+        required: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -21,15 +45,15 @@ permissions: {}

 jobs:
  verify-triggering-actor:
-    name: make_release_cuda/verify-triggering-actor
+    name: make_release_common_cuda/verify-triggering-actor
    if: startsWith(github.ref, 'refs/tags/')
    uses: ./.github/workflows/verify_triggering_actor.yml
    secrets:
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  setup-instance:
-    name: make_release_cuda/setup-instance
+    name: make_release_common_cuda/setup-instance
    needs: verify-triggering-actor
    runs-on: ubuntu-latest
    outputs:
@@ -37,7 +61,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,7 +71,7 @@ jobs:
          profile: gpu-build

  package:
-    name: make_release_cuda/package
+    name: make_release_common_cuda/package
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    outputs:
@@ -76,7 +100,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -89,7 +112,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -101,12 +123,14 @@ jobs:
          GCC_VERSION: ${{ matrix.gcc }}

      - name: Prepare package
+        env:
+          PACKAGE: ${{ inputs.package-name }}
        run: |
-          cargo package -p tfhe-cuda-backend
+          cargo package -p "${PACKAGE}"

      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate

      - name: generate hash
@@ -114,8 +138,8 @@ jobs:
        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

  provenance:
-    name: make_release_cuda/provenance
-    if: ${{ !inputs.dry_run  }}
+    name: make_release_common_cuda/provenance
+    if: ${{ !inputs.dry-run  }}
    needs: [package]
    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
@@ -128,7 +152,7 @@ jobs:
      base64-subjects: ${{ needs.package.outputs.hash }}

  publish-cuda-release:
-    name: make_release_cuda/publish-cuda-release
+    name: make_release_common_cuda/publish-cuda-release
    needs: [setup-instance, package] # for comparing hashes
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    permissions:
@@ -150,7 +174,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -163,7 +186,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -177,22 +199,23 @@ jobs:
      - name: Download artifact
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
        env:
          CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+          PACKAGE: ${{ inputs.package-name }}
+          DRY-RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
+          # dry-run expansion cannot be double quoted when variable contains empty string otherwise cargo publish
+          # would fail. This is safe since dry-run is handled in the env section above.
          # shellcheck disable=SC2086
-          cargo publish -p tfhe-cuda-backend ${DRY_RUN}
+          cargo publish -p "${PACKAGE}" ${DRY-RUN}

      - name: Generate hash
        id: published_hash
@@ -204,7 +227,7 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -212,17 +235,17 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: make_release_cuda/teardown-instance
+    name: make_release_common_cuda/teardown-instance
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, publish-cuda-release]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -235,4 +258,4 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -16,6 +16,10 @@ on:
        description: "Push web js package"
        type: boolean
        default: true
+      push_web_compat_package:
+        description: "Push web compat (cross-origin) js package"
+        type: boolean
+        default: true
      push_node_package:
        description: "Push node js package"
        type: boolean
@@ -99,6 +103,23 @@ jobs:
          tag: ${{ env.NPM_TAG }}
          provenance: true

+      - name: Build web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        run: |
+          rm -rf tfhe/pkg
+
+          make build_web_js_api
+          sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
+
+      - name: Publish web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
+        with:
+          package: tfhe/pkg/package.json
+          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}
+          provenance: true
+
      - name: Build Node package
        if: ${{ inputs.push_node_package }}
        run: |
--- a/.github/workflows/make_release_tfhe_cuda.yml
+++ b/.github/workflows/make_release_tfhe_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of tfhe-rs CUDA backend on crates.io.
+name: make_release_tfhe_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "tfhe-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/make_release_tfhe_safe_serialize.yml
+++ b/.github/workflows/make_release_tfhe_safe_serialize.yml
@@ -0,0 +1,32 @@
+name: make_release_tfhe_safe_serialize
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_safe_serialize/make-release
+    uses: ./.github/workflows/make_release_common.yml
+    with:
+      package-name: "tfhe-safe-serialize"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
--- a/.github/workflows/make_release_zk_cuda.yml
+++ b/.github/workflows/make_release_zk_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
+name: make_release_zk_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_zk_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "zk-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Restore Sagemath image from cache
        id: docker-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
      - name: Store Sagemath image in cache
        if: steps.docker-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
+backends/tfhe-cuda-backend/cuda/build/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,7 @@ members = [
    "utils/tfhe-backward-compat-checker",
    "utils/tfhe-backward-compat-data",
    "utils/tfhe-backward-compat-data/crates/add_new_version",
+    "utils/tfhe-safe-serialize",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/wasm-par-mq",
@@ -44,6 +45,7 @@ rand = "0.8"
 rayon = "1.11"
 serde = { version = "1.0", default-features = false }
 wasm-bindgen = { version = "0.2.114" }
+wasm-bindgen-futures = { version = "0.4.56" }
 # js-sys (at this point in time) automatically enables the unsafe-eval feature which we do not want
 # this does not prevent other deps from enabling it, but it at least conveys our need to not have it
 # we still enable std, which was part of default before
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/240
+++ b/240
@@ -122,6 +122,12 @@ install_build_wasm32_target:
 	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
 	Rustup can be downloaded at https://rustup.rs/" && exit 1 )

+.PHONY: install_check_wasm32_target # Install the wasm32 toolchain used for checks
+install_check_wasm32_target:
+	rustup target add wasm32-unknown-unknown --toolchain "$(RS_CHECK_TOOLCHAIN)" || \
+	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
+	Rustup can be downloaded at https://rustup.rs/" && exit 1 )
+
 .PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
 install_cargo_nextest:
 	@cargo nextest --version > /dev/null 2>&1 || \
@@ -350,23 +356,23 @@ check_fmt_js: check_nvm_installed
 .PHONY: check_fmt_toml # Check TOML files format
 check_fmt_toml: install_taplo
 	@RUST_LOG=warn taplo fmt --check || \
-	echo "TOML files format check failed. Please run 'make fmt_toml'"
+	{ echo "TOML files format check failed. Please run 'make fmt_toml'"; exit 1; }

 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
-	@typos && echo "No typos found"
+	@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"

 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
 		--all-targets \
 		-p tfhe

@@ -380,7 +386,7 @@ clippy_hpu: install_rs_check_toolchain
 .PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
 clippy_gpu_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

@@ -473,7 +479,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 	fi && \
 	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
 		-p tfhe -- --nocapture

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -484,11 +490,17 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
+		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
 		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,parallel-wasm-api \
+		-p tfhe -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
@@ -529,6 +541,15 @@ clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings

+.PHONY: clippy_zk_pok_wasm # Run clippy lints on tfhe-zk-pok for wasm32 target
+clippy_zk_pok_wasm: install_rs_check_toolchain install_check_wasm32_target
+	RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--target wasm32-unknown-unknown \
+		-p tfhe-zk-pok -- --no-deps -D warnings
+	RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--target wasm32-unknown-unknown \
+		-p tfhe-zk-pok --features cross-origin-wasm -- --no-deps -D warnings
+
 .PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
 clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -536,6 +557,11 @@ clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-versionable -- --no-deps -D warnings

+.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
+clippy_safe_serialize: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-safe-serialize -- --no-deps -D warnings
+
 .PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
 clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -566,10 +592,12 @@ clippy_test_vectors: install_rs_check_toolchain
 	cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-test-vectors -- --no-deps -D warnings

+# WARNING: This target is not directly run in CI. When adding a subtarget here,
+# MAKE SURE TO ALSO ADD IT TO A PCC BATCH BELOW
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
+clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
 clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq

 .PHONY: clippy_fast # Run main clippy targets
@@ -666,7 +694,7 @@ build_c_api: install_rs_check_toolchain
 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
 		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -675,11 +703,14 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
 		-p tfhe

-.PHONY: build_web_js_api # Build the js API targeting the web browser
+.PHONY: build_web_js_api # Build the js API targeting the web browser, in sequential or cross origin parallelism modes.
 build_web_js_api: install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api && \
+	find pkg/snippets -type f -iname worker_helpers.js -exec sed -i 's|import("../../..")|import("../../../tfhe.js")|g' {} \;
+	cp utils/wasm-par-mq/js/coordinator.js tfhe/pkg/
+	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 # parallel wasm requires specific build options, see https://github.com/rust-lang/rust/pull/147225
@@ -765,7 +796,7 @@ test_zk_cuda_backend:


 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu:
@@ -1201,12 +1232,31 @@ test_tfhe_csprng_big_endian: install_cargo_cross
 	RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu

-
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok --features experimental

+.PHONY: test_zk_pok_experimental_gpu # Run tfhe-zk-pok GPU-accelerated tests
+test_zk_pok_experimental_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
+
+.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
+test_integer_zk_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--features=integer,zk-pok,gpu -p tfhe -- \
+		integer::gpu::zk::
+
+.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
+test_integer_zk_experimental_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
+		integer::gpu::zk::
+
+.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
+test_zk_cuda: test_zk_cuda_backend test_zk_pok_experimental_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
+
 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
@@ -1225,6 +1275,11 @@ test_versionable:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		--all-targets -p tfhe-versionable

+.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
+test_safe_serialize:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--all-targets -p tfhe-safe-serialize
+
 # The backward compat data folder holds historical binary data but also rust code to generate and load them.
 .PHONY: gen_backward_compat_data # Re-generate backward compatibility data
 gen_backward_compat_data:
@@ -1359,6 +1414,19 @@ test_nodejs_wasm_api_ci: build_node_js_api

 # This is an internal target, not meant to be called on its own.
 run_web_js_api_parallel: build_web_js_api_parallel setup_venv
+	cd $(WEB_SERVER_DIR) && npm install && npm run build
+	source venv/bin/activate && \
+	python ci/webdriver.py \
+	--browser-path $(browser_path) \
+	--driver-path $(driver_path) \
+	--browser-kind  $(browser_kind) \
+	--server-cmd $(server_cmd) \
+	--server-workdir "$(WEB_SERVER_DIR)" \
+	--id-pattern $(filter) \
+	--id-exclude-pattern asyncMainThread
+
+# This is an internal target, not meant to be called on its own.
+run_web_js_api_cross_origin: build_web_js_api setup_venv
 	cd $(WEB_SERVER_DIR) && npm install && npm run build
 	source venv/bin/activate && \
 	python ci/webdriver.py \
@@ -1401,6 +1469,38 @@ test_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) test_web_js_api_parallel_firefox

+test_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+test_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+test_web_js_api_cross_origin_chrome: browser_kind = chrome
+test_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
+test_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
+
+.PHONY: test_web_js_api_cross_origin_chrome # Run tests for the web wasm api in cross-origin mode on Chrome
+test_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
+
+.PHONY: test_web_js_api_cross_origin_chrome_ci # Run tests for the web wasm api in cross-origin mode on Chrome
+test_web_js_api_cross_origin_chrome_ci: setup_venv
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) test_web_js_api_cross_origin_chrome
+
+test_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
+test_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
+test_web_js_api_cross_origin_firefox: browser_kind = firefox
+test_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
+test_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeTest  # Only run zk proof tests in cross-origin mode
+
+.PHONY: test_web_js_api_cross_origin_firefox # Run tests for the web wasm api in cross-origin mode on Firefox
+test_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
+
+.PHONY: test_web_js_api_cross_origin_firefox_ci # Run tests for the web wasm api in cross-origin mode on Firefox
+test_web_js_api_cross_origin_firefox_ci: setup_venv
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) test_web_js_api_cross_origin_firefox
+
 WASM_PAR_MQ_TEST_DIR=utils/wasm-par-mq/web_tests

 .PHONY: build_wasm_par_mq_tests # Build the wasm-par-mq test WASM package
@@ -1564,27 +1664,50 @@ bench_integer_rerand_gpu: install_rs_check_toolchain
 	--bench integer-rerand \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --

+.PHONY: bench_msm_zk
+bench_msm_zk: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-msm \
+	--features=zk-pok -p tfhe-benchmark --profile release --
+
+# GPU benchmarks need --profile release for correct measurements
+.PHONY: bench_msm_zk_gpu
+bench_msm_zk_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-msm \
+	--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release -- zk::cuda::msm
+
+# GPU benchmarks need --profile release for correct measurements
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
+
+# GPU benchmarks need --profile release for correct measurements
+.PHONY: bench_integer_zk_experimental_gpu
+bench_integer_zk_experimental_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --

 .PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
 bench_integer_aes_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
 bench_integer_aes256_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes256 \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
 bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1748,37 +1871,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_parallel_firefox

-bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
-bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
-bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
+bench_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+bench_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+bench_web_js_api_cross_origin_chrome: browser_kind = chrome
+bench_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
+bench_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers

-.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
+.PHONY: bench_web_js_api_cross_origin_chrome # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin

-.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
+.PHONY: bench_web_js_api_cross_origin_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_chrome_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_unsafe_coop_chrome
+	$(MAKE) bench_web_js_api_cross_origin_chrome

-bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
-bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
-bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
+bench_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
+bench_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
+bench_web_js_api_cross_origin_firefox: browser_kind = firefox
+bench_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
+bench_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers

-.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
+.PHONY: bench_web_js_api_cross_origin_firefox # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin

-.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
+.PHONY: bench_web_js_api_cross_origin_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_firefox_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_unsafe_coop_firefox
+	$(MAKE) bench_web_js_api_cross_origin_firefox

 .PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
 bench_hlapi_unsigned: install_rs_check_toolchain
@@ -1811,25 +1934,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
 	--bench hlapi \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
-bench_hlapi_erc20: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
+bench_hlapi_erc7984: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
-bench_hlapi_erc20_gpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
+bench_hlapi_erc7984_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
-bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
+bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
@@ -1853,13 +1976,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
-bench_hlapi_erc20_hpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ECR20 operations on HPU
+bench_hlapi_erc7984_hpu: install_rs_check_toolchain
 	source ./setup_hpu.sh --config $(HPU_CONFIG); \
 	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
@@ -1867,6 +1990,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

+.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
+bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--package tfhe-zk-pok \
+	--features=gpu-experimental --profile release
+
 .PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
 bench_hlapi_noise_squash: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
@@ -1908,10 +2038,10 @@ bench_summary: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'

 	# DEX
@@ -1953,10 +2083,10 @@ bench_summary_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'

 	# DEX
@@ -2143,8 +2273,10 @@ pcc_batch_6:
 	$(call run_recipe_with_details,clippy_tasks)
 	$(call run_recipe_with_details,clippy_tfhe_csprng)
 	$(call run_recipe_with_details,clippy_zk_pok)
+	$(call run_recipe_with_details,clippy_zk_pok_wasm)
 	$(call run_recipe_with_details,clippy_trivium)
 	$(call run_recipe_with_details,clippy_versionable)
+	$(call run_recipe_with_details,clippy_safe_serialize)
 	$(call run_recipe_with_details,clippy_param_dedup)
 	$(call run_recipe_with_details,docs)

--- a/_typos.toml
+++ b/_typos.toml
@@ -15,12 +15,3 @@ extend-ignore-identifiers-re = [
    "0x[0-9a-fA-F]+",
    "xrt_coreutil",
 ]
-
-[files]
-extend-exclude = [
-    "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
-    "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
-    "backends/tfhe-hpu-backend/config_store/**/*.link_summary",
-    "*.cbor",
-    "*.bcode",
-]
--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
+++ b/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
@@ -62,3 +62,29 @@ rules:
            cuda_synchronize_stream(...);
            ...
          }
+
+  - id: tfhe-cuda-unwrapped-cuda-runtime-call
+    message: "CUDA runtime API call is not wrapped in `check_cuda_error(...)`."
+    severity: WARNING
+    languages: [c, cpp]
+    options:
+      generic_ellipsis_max_span: 500
+    paths:
+      include:
+        - "*.cu"
+        - "*.cuh"
+        - "*.cpp"
+        - "*.h"
+      exclude:
+        - backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
+        - backends/tfhe-cuda-backend/cuda/include/device.h # contains the cuda_check_error macro (and others)
+    patterns:
+      - pattern: $FUNC(...)
+      - metavariable-regex:
+          metavariable: $FUNC
+          regex: "^cuda[A-Z][A-Za-z0-9]*$" # matches cudaMalloc/cudaMemcpy/... (not project helpers like cuda_set_device)
+      - pattern-not-inside: check_cuda_error(...)
+      - pattern-not-inside: |
+          $FUNC(...);
+          check_cuda_error(cudaGetLastError());
+      - pattern-not-inside: $FUNC(...) == $VAL
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -382,14 +382,17 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                       ->use_sequential_algorithm_to_resolve_group_carries;

    cuda_set_device(0);
-    cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming);
+    check_cuda_error(
+        cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
    create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
                                    use_seq, allocate_gpu_memory, size_tracker);
-    cudaEventRecord(create_indexes_done, streams.stream(0));
+    check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
    cuda_set_device(1);
-    cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0);
+    check_cuda_error(
+        cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
    cuda_set_device(2);
-    cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0);
+    check_cuda_error(
+        cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));

    scatter_indexes_for_overflowing_sub(
        streams.stream(1), streams.gpu_index(1),
@@ -842,7 +845,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    free(second_indexes_for_overflow_sub_gpu_2);
    free(scalars_for_overflow_sub_gpu_2);

-    cudaEventDestroy(create_indexes_done);
+    check_cuda_error(cudaEventDestroy(create_indexes_done));

    // release sub streams
    sub_streams_1.release();
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -326,6 +326,10 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index) {
  if (size == 0)
    return;
+
+  GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
+  GPU_ASSERT(dest != nullptr, "Cuda error: null device ptr");
+
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  PANIC_IF_FALSE(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -373,7 +373,8 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -517,7 +517,8 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -784,9 +784,9 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc_128(
      device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
      cudaFuncAttributeMaxDynamicSharedMemorySize,
      full_sm)); // full_sm + minimum_sm_tbc));
-  cudaFuncSetCacheConfig(
+  check_cuda_error(cudaFuncSetCacheConfig(
      device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
-      cudaFuncCachePreferShared);
+      cudaFuncCachePreferShared));
  check_cuda_error(cudaFuncSetAttribute(
      device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
      cudaFuncAttributeNonPortableClusterSizeAllowed, true));
@@ -1271,7 +1271,8 @@ __host__ bool verify_cuda_programmable_bootstrap_128_cg_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));

  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -1212,46 +1212,47 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
  int max_active_blocks_per_sm;

  if (max_shared_memory < partial_sm_cg_accumulate) {
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, NOSM>,
-        thds, 0);
+        thds, 0));
  } else if (max_shared_memory < full_sm_cg_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
-    cudaFuncSetCacheConfig(
+    check_cuda_error(cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  PARTIALSM>,
-        cudaFuncCachePreferShared);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        cudaFuncCachePreferShared));
+    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, PARTIALSM>,
-        thds, partial_sm_cg_accumulate);
+        thds, partial_sm_cg_accumulate));
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
-    cudaFuncSetCacheConfig(
+    check_cuda_error(cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  FULLSM>,
-        cudaFuncCachePreferShared);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        cudaFuncCachePreferShared));
+    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, FULLSM>,
-        thds, full_sm_cg_accumulate);
+        thds, full_sm_cg_accumulate));
    check_cuda_error(cudaGetLastError());
  }

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -739,7 +739,8 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -136,9 +136,6 @@ pub type Direction = ffi::c_uint;
 pub const BitValue_Zero: BitValue = 0;
 pub const BitValue_One: BitValue = 1;
 pub type BitValue = ffi::c_uint;
-pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
-pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
-pub type RERAND_MODE = ffi::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct CudaStreamsFFI {
@@ -2479,6 +2476,9 @@ unsafe extern "C" {
        glwe_index: u32,
    );
 }
+pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
+pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
+pub type RERAND_MODE = ffi::c_uint;
 unsafe extern "C" {
    pub fn scratch_cuda_rerand_64_async(
        streams: CudaStreamsFFI,
@@ -2491,7 +2491,7 @@ unsafe extern "C" {
        message_modulus: u32,
        carry_modulus: u32,
        allocate_gpu_memory: bool,
-        rerand_type: u32,
+        rerand_type: RERAND_MODE,
    ) -> u64;
 }
 unsafe extern "C" {
--- a/backends/tfhe-hpu-backend/LICENSE
+++ b/backends/tfhe-hpu-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-hpu-backend/README.md
+++ b/backends/tfhe-hpu-backend/README.md
@@ -297,8 +297,8 @@ source setup_hpu.sh --config v80 -p
 # Run hlapi benches
 make test_high_level_api_hpu

-# Run hlapi erc20 benches
-make bench_hlapi_erc20_hpu 
+# Run hlapi erc7984 benches
+make bench_hlapi_erc7984_hpu 

 # Run integer level benches
 make bench_integer_hpu
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
@@ -109,7 +109,7 @@
  flush_behaviour = "Patient"
  flush = true

-[firmware.op_cfg.by_op.ERC_20]
+[firmware.op_cfg.by_op.ERC_7984]
  fill_batch_fifo = true
  min_batch_size = false
  use_tiers = true
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
@@ -121,7 +121,7 @@
  flush_behaviour = "Patient"
  flush = true

-[firmware.op_cfg.by_op.ERC_20]
+[firmware.op_cfg.by_op.ERC_7984]
  fill_batch_fifo = true
  min_batch_size = false
  use_tiers = true
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -230,7 +230,7 @@ iop!(
    [IOP_CMP -> "CMP_NEQ", opcode::CMP_NEQ],
    [IOP_CT_F_CT_BOOL -> "IF_THEN_ZERO", opcode::IF_THEN_ZERO],
    [IOP_CT_F_2CT_BOOL -> "IF_THEN_ELSE", opcode::IF_THEN_ELSE],
-    [IOP_2CT_F_3CT -> "ERC_20", opcode::ERC_20],
+    [IOP_2CT_F_3CT -> "ERC_7984", opcode::ERC_7984],
    [IOP_CT_F_CT -> "MEMCPY", opcode::MEMCPY],
    [IOP_CT_F_CT -> "ILOG2", opcode::ILOG2],
    [IOP_CT_F_CT -> "COUNT0", opcode::COUNT0],
@@ -240,5 +240,5 @@ iop!(
    [IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
    [IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
    [IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
-    [IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
+    [IOP_2NCT_F_3NCT -> "ERC_7984_SIMD", opcode::ERC_7984_SIMD],
 );
--- a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
@@ -74,9 +74,9 @@ pub const IF_THEN_ZERO: u8 = 0xCA;
 pub const IF_THEN_ELSE: u8 = 0xCB;

 // Custom algorithm
-// ERC20 -> Found xfer algorithm
+// ERC7984 -> Found xfer algorithm
 // 2Ct <- func(3Ct)
-pub const ERC_20: u8 = 0x80;
+pub const ERC_7984: u8 = 0x80;

 // Count bits
 pub const COUNT0: u8 = 0x81;
@@ -89,7 +89,7 @@ pub const TRAIL1: u8 = 0x87;

 // SIMD for maximum throughput
 pub const ADD_SIMD: u8 = 0xF0;
-pub const ERC_20_SIMD: u8 = 0xF1;
+pub const ERC_7984_SIMD: u8 = 0xF1;
 //
 // Utility operations
 // Used to handle real clone of ciphertext already uploaded in the Hpu memory
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs
@@ -31,7 +31,7 @@ crate::impl_fw!("Demo" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_20 => fw_impl::ilp::iop_erc_20;
+    ERC_7984 => fw_impl::ilp::iop_erc_7984;

    CMP_GT  => cmp_gt;
    CMP_GTE => cmp_gte;
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
@@ -61,7 +61,7 @@ crate::impl_fw!("Ilp" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_20 => fw_impl::ilp::iop_erc_20;
+    ERC_7984 => fw_impl::ilp::iop_erc_7984;

    MEMCPY => fw_impl::ilp::iop_memcpy;

@@ -74,7 +74,7 @@ crate::impl_fw!("Ilp" [
    TRAIL1 => fw_impl::ilp_log::iop_trail1;
    // SIMD Implementations
    ADD_SIMD     => fw_impl::llt::iop_add_simd;
-    ERC_20_SIMD  => fw_impl::llt::iop_erc_20_simd;
+    ERC_7984_SIMD  => fw_impl::llt::iop_erc_7984_simd;
 ]);

 #[instrument(level = "trace", skip(prog))]
@@ -1296,13 +1296,13 @@ pub fn iop_if_then_else(prog: &mut Program) {
        });
 }

-/// Implement erc_20 fund xfer
+/// Implement erc_7984 fund xfer
 /// Targeted algorithm is as follow:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
 /// 3. Compute new amount (from - new_amount, to + new_amount)
 #[instrument(level = "info", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
+pub fn iop_erc_7984(prog: &mut Program) {
    // Allocate metavariables:
    // Dest -> Operand
    let mut dst_from = prog.iop_template_var(OperandKind::Dst, 0);
@@ -1314,7 +1314,7 @@ pub fn iop_erc_20(prog: &mut Program) {
    let src_amount = prog.iop_template_var(OperandKind::Src, 2);

    // Add Comment header
-    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());

    let props = prog.params();
    let tfhe_params: asm::DigitParameters = props.clone().into();
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
@@ -70,7 +70,7 @@ crate::impl_fw!("Llt" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_20 => fw_impl::llt::iop_erc_20;
+    ERC_7984 => fw_impl::llt::iop_erc_7984;
    MEMCPY => fw_impl::ilp::iop_memcpy;

    COUNT0 => fw_impl::ilp_log::iop_count0;
@@ -83,7 +83,7 @@ crate::impl_fw!("Llt" [

    // SIMD Implementations
    ADD_SIMD     => fw_impl::llt::iop_add_simd;
-    ERC_20_SIMD  => fw_impl::llt::iop_erc_20_simd;
+    ERC_7984_SIMD  => fw_impl::llt::iop_erc_7984_simd;
 ]);

 // ----------------------------------------------------------------------------
@@ -225,24 +225,24 @@ pub fn iop_muls(prog: &mut Program) {
 }

 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
+pub fn iop_erc_7984(prog: &mut Program) {
    // Add Comment header
-    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
    // TODO: Make sweep of kogge_blk_w
    // All these little parameters would be very handy to write an
    // exploration/compilation program which would try to minimize latency by
    // playing with these.
-    iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
+    iop_erc_7984_rtl(prog, 0, Some(10)).add_to_prog(prog);
 }

 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_simd(prog: &mut Program) {
+pub fn iop_erc_7984_simd(prog: &mut Program) {
    // Add Comment header
-    prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_7984_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
    simd(
        prog,
        crate::asm::iop::SIMD_N,
-        fw_impl::llt::iop_erc_20_rtl,
+        fw_impl::llt::iop_erc_7984_rtl,
        None,
    );
 }
@@ -379,7 +379,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
 // Helper Functions
 // ----------------------------------------------------------------------------

-/// Implement erc_20 fund xfer
+/// Implement erc_7984 fund xfer
 /// Targeted algorithm is as follow:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
@@ -391,7 +391,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
 ///     (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
 /// Where N is the batch size
 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
+pub fn iop_erc_7984_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
    // Allocate metavariables:
    // Dest -> Operand
    let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
--- a/backends/zk-cuda-backend/Cargo.toml
+++ b/backends/zk-cuda-backend/Cargo.toml
@@ -24,7 +24,7 @@ bindgen.workspace = true
 [dependencies]
 ark-ec.workspace = true
 ark-ff.workspace = true
-tfhe-cuda-backend = { version = "=0.14.0", path = "../tfhe-cuda-backend" }
+tfhe-cuda-backend = { version = "0.14.0", path = "../tfhe-cuda-backend" }

 [features]
 default = []
--- a/backends/zk-cuda-backend/cuda/include/msm.h
+++ b/backends/zk-cuda-backend/cuda/include/msm.h
@@ -97,28 +97,23 @@ size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index);
 //   d_scalars: Device pointer to input BigInt scalars (array of n scalars)
 //   n: Number of points/scalars
 //   d_scratch: Caller-provided device scratch buffer for intermediate results
-//   size_tracker: Reference for tracking GPU memory allocation sizes
 void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
                        G1Projective *h_result, const G1Affine *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G1Projective *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated);
+                        G1Projective *d_scratch);

 void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
                  G1Projective *h_result, const G1Affine *d_points,
-                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
-                  uint64_t &size_tracker, bool gpu_memory_allocated);
+                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch);

 // MSM for G2 points with BigInt scalars (projective result)
 // Result is written directly to a host pointer.
 void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
                        G2ProjectivePoint *h_result, const G2Point *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated);
+                        G2ProjectivePoint *d_scratch);

 void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
                  G2ProjectivePoint *h_result, const G2Point *d_points,
                  const Scalar *d_scalars, uint32_t n,
-                  G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                  bool gpu_memory_allocated);
+                  G2ProjectivePoint *d_scratch);
--- a/backends/zk-cuda-backend/cuda/src/msm/msm.cu
+++ b/backends/zk-cuda-backend/cuda/src/msm/msm.cu
@@ -8,17 +8,16 @@
 // Multi-Scalar Multiplication (MSM) using Pippenger algorithm for BLS12-446

 // Forward declarations for Pippenger implementations
-void point_msm_g1_pippenger_async(
-    cudaStream_t stream, uint32_t gpu_index, G1Projective *h_result,
-    const G1Affine *d_points, const Scalar *d_scalars, uint32_t n,
-    G1Projective *d_scratch, uint64_t &size_tracker, bool gpu_memory_allocated);
+void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
+                                  G1Projective *h_result,
+                                  const G1Affine *d_points,
+                                  const Scalar *d_scalars, uint32_t n,
+                                  G1Projective *d_scratch);
 void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
                                  G2ProjectivePoint *h_result,
                                  const G2Point *d_points,
                                  const Scalar *d_scalars, uint32_t n,
-                                  G2ProjectivePoint *d_scratch,
-                                  uint64_t &size_tracker,
-                                  bool gpu_memory_allocated);
+                                  G2ProjectivePoint *d_scratch);

 // ============================================================================
 // Public MSM API for BigInt scalars
@@ -29,11 +28,9 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
 void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
                        G1Projective *h_result, const G1Affine *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G1Projective *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated) {
+                        G1Projective *d_scratch) {
  point_msm_g1_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
-                               n, d_scratch, size_tracker,
-                               gpu_memory_allocated);
+                               n, d_scratch);
 }

 // MSM with BigInt scalars for G2 (projective coordinates internally)
@@ -41,19 +38,17 @@ void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
 void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
                        G2ProjectivePoint *h_result, const G2Point *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated) {
+                        G2ProjectivePoint *d_scratch) {
  point_msm_g2_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
-                               n, d_scratch, size_tracker,
-                               gpu_memory_allocated);
+                               n, d_scratch);
 }

 void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
                  G1Projective *h_result, const G1Affine *d_points,
-                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
-                  uint64_t &size_tracker, bool gpu_memory_allocated) {
+                  const Scalar *d_scalars, uint32_t n,
+                  G1Projective *d_scratch) {
  point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                     d_scratch, size_tracker, gpu_memory_allocated);
+                     d_scratch);
  // The async impl already syncs internally before the CPU-side Horner phase,
  // so the stream is idle here. This sync is kept for defensive correctness.
  cuda_synchronize_stream(stream, gpu_index);
@@ -62,10 +57,9 @@ void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
 void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
                  G2ProjectivePoint *h_result, const G2Point *d_points,
                  const Scalar *d_scalars, uint32_t n,
-                  G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                  bool gpu_memory_allocated) {
+                  G2ProjectivePoint *d_scratch) {
  point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                     d_scratch, size_tracker, gpu_memory_allocated);
+                     d_scratch);
  // See comment in point_msm_g1 above.
  cuda_synchronize_stream(stream, gpu_index);
 }
--- a/backends/zk-cuda-backend/cuda/src/msm/pippenger/msm_pippenger.cu
+++ b/backends/zk-cuda-backend/cuda/src/msm/pippenger/msm_pippenger.cu
@@ -493,12 +493,13 @@ void horner_combine_cpu(ProjectiveType &result,
 // window sums. The caller is responsible for allocating and freeing this
 // buffer.
 template <typename AffineType, typename ProjectiveType>
-void point_msm_pippenger_impl_async(
-    cudaStream_t stream, uint32_t gpu_index, ProjectiveType *h_result,
-    const AffineType *d_points, const Scalar *d_scalars, uint32_t n,
-    uint32_t threads_per_block, uint32_t window_size, uint32_t bucket_count,
-    ProjectiveType *d_scratch, uint64_t &size_tracker,
-    bool gpu_memory_allocated) {
+void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
+                                    ProjectiveType *h_result,
+                                    const AffineType *d_points,
+                                    const Scalar *d_scalars, uint32_t n,
+                                    uint32_t threads_per_block,
+                                    uint32_t window_size, uint32_t bucket_count,
+                                    ProjectiveType *d_scratch) {
  using ProjectivePoint = Projective<ProjectiveType>;

  if (n == 0) {
@@ -705,16 +706,13 @@ void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
                                  G1Projective *h_result,
                                  const G1Affine *d_points,
                                  const Scalar *d_scalars, uint32_t n,
-                                  G1Projective *d_scratch,
-                                  uint64_t &size_tracker,
-                                  bool gpu_memory_allocated) {
+                                  G1Projective *d_scratch) {
  uint32_t window_size, bucket_count;
  get_g1_window_params(n, window_size, bucket_count);

  point_msm_pippenger_impl_async<G1Affine, G1Projective>(
      stream, gpu_index, h_result, d_points, d_scalars, n,
-      msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch,
-      size_tracker, gpu_memory_allocated);
+      msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch);
 }

 // MSM with BigInt scalars for G2 (projective coordinates internally)
@@ -724,14 +722,11 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
                                  G2ProjectivePoint *h_result,
                                  const G2Point *d_points,
                                  const Scalar *d_scalars, uint32_t n,
-                                  G2ProjectivePoint *d_scratch,
-                                  uint64_t &size_tracker,
-                                  bool gpu_memory_allocated) {
+                                  G2ProjectivePoint *d_scratch) {
  uint32_t window_size, bucket_count;
  get_g2_window_params(n, window_size, bucket_count);

  point_msm_pippenger_impl_async<G2Point, G2ProjectivePoint>(
      stream, gpu_index, h_result, d_points, d_scalars, n,
-      msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch,
-      size_tracker, gpu_memory_allocated);
+      msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch);
 }
--- a/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
+++ b/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
@@ -187,37 +187,82 @@ __host__ __device__ void fp_copy(Fp &dst, const Fp &src) {
 // "Raw" means without modular reduction - performs a + b and returns carry.
 // This is an internal helper used by fp_add() which handles reduction.
 __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
+#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
+  // PTX carry-chain: add.cc sets the hardware carry flag, addc.cc propagates
+  // it. This replaces 2 software carry-detect comparisons per limb (~14 extra
+  // instructions across 7 limbs) with zero-cost hardware flag propagation.
+  uint64_t carry_out;
+  asm("add.cc.u64   %0,  %8,  %15;\n\t" // c[0] = a[0] + b[0], set CF
+      "addc.cc.u64  %1,  %9,  %16;\n\t" // c[1] = a[1] + b[1] + CF
+      "addc.cc.u64  %2,  %10, %17;\n\t" // c[2] = a[2] + b[2] + CF
+      "addc.cc.u64  %3,  %11, %18;\n\t" // c[3] = a[3] + b[3] + CF
+      "addc.cc.u64  %4,  %12, %19;\n\t" // c[4] = a[4] + b[4] + CF
+      "addc.cc.u64  %5,  %13, %20;\n\t" // c[5] = a[5] + b[5] + CF
+      "addc.cc.u64  %6,  %14, %21;\n\t" // c[6] = a[6] + b[6] + CF
+      "addc.u64     %7,  0,   0;\n\t"   // carry_out = 0 + 0 + CF
+      : "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
+        "=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(carry_out)
+      : "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
+        "l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
+        "l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
+        "l"(b.limb[5]), "l"(b.limb[6]));
+  return carry_out;
+#else
+  // Host path: portable software carry detection
  UNSIGNED_LIMB carry = 0;

  for (int i = 0; i < FP_LIMBS; i++) {
-    // Add with carry: c = a + b + carry
    UNSIGNED_LIMB sum = a.limb[i] + carry;
-    carry = (sum < a.limb[i]) ? 1 : 0; // Check for overflow
+    carry = (sum < a.limb[i]) ? 1 : 0;
    sum += b.limb[i];
-    carry += (sum < b.limb[i]) ? 1 : 0; // Check for overflow
+    carry += (sum < b.limb[i]) ? 1 : 0;
    c.limb[i] = sum;
  }

  return carry;
+#endif
 }

 // Subtraction with borrow propagation
 // "Raw" means without modular reduction - performs a - b and returns borrow.
 // This is an internal helper used by fp_sub() which handles reduction.
 __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
+#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
+  // PTX borrow-chain: sub.cc sets the hardware borrow flag, subc.cc propagates
+  // it. Same benefit as fp_add_raw -- eliminates 2 comparisons per limb.
+  uint64_t borrow_out;
+  asm("sub.cc.u64   %0,  %8,  %15;\n\t" // c[0] = a[0] - b[0], set CF
+      "subc.cc.u64  %1,  %9,  %16;\n\t" // c[1] = a[1] - b[1] - CF
+      "subc.cc.u64  %2,  %10, %17;\n\t" // c[2] = a[2] - b[2] - CF
+      "subc.cc.u64  %3,  %11, %18;\n\t" // c[3] = a[3] - b[3] - CF
+      "subc.cc.u64  %4,  %12, %19;\n\t" // c[4] = a[4] - b[4] - CF
+      "subc.cc.u64  %5,  %13, %20;\n\t" // c[5] = a[5] - b[5] - CF
+      "subc.cc.u64  %6,  %14, %21;\n\t" // c[6] = a[6] - b[6] - CF
+      "subc.u64     %7,  0,   0;\n\t"   // borrow_out = 0 - 0 - CF
+      : "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
+        "=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(borrow_out)
+      : "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
+        "l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
+        "l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
+        "l"(b.limb[5]), "l"(b.limb[6]));
+  // subc.u64 with 0-0-CF produces 0 if no borrow, or 0xFFFFFFFFFFFFFFFF if
+  // borrow. Normalize to 0/1 for callers that check (borrow != 0) or add it.
+  return borrow_out & 1;
+#else
+  // Host path: portable software borrow detection
  UNSIGNED_LIMB borrow = 0;

  for (int i = 0; i < FP_LIMBS; i++) {
-    // Subtract with borrow: c = a - b - borrow
    UNSIGNED_LIMB diff = a.limb[i] - borrow;
-    borrow = (diff > a.limb[i]) ? 1 : 0; // Check for underflow
+    borrow = (diff > a.limb[i]) ? 1 : 0;
    UNSIGNED_LIMB old_diff = diff;
    diff -= b.limb[i];
-    borrow += (diff > old_diff) ? 1 : 0; // Check for underflow
+    borrow += (diff > old_diff) ? 1 : 0;
    c.limb[i] = diff;
  }

  return borrow;
+#endif
 }

 // Addition with modular reduction: c = (a + b) mod p
@@ -226,7 +271,27 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
  Fp sum;
  UNSIGNED_LIMB carry = fp_add_raw(sum, a, b);

-  // If there's a carry or sum >= MODULUS, we need to reduce
+#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
+  // Branchless reduction: always compute sum - p, then select based on
+  // whether reduction was needed. This avoids divergent branches that stall
+  // warps when some threads need reduction and others don't.
+  //
+  // Decision logic:
+  //   carry=1 -> sum overflowed 448 bits, definitely >= p -> use reduced
+  //   carry=0, borrow=0 -> sum >= p in 448 bits -> use reduced
+  //   carry=0, borrow=1 -> sum < p -> use original sum
+  // So: use_original = (!carry) & borrow
+  Fp reduced;
+  UNSIGNED_LIMB borrow = fp_sub_raw(reduced, sum, fp_modulus());
+  UNSIGNED_LIMB use_original = ((carry ^ 1) & borrow);
+  UNSIGNED_LIMB mask =
+      -use_original; // all-ones if keep sum, all-zeros if keep reduced
+
+  for (int i = 0; i < FP_LIMBS; i++) {
+    c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
+  }
+#else
+  // Host path: branching is fine on CPU (branch predictor handles it well)
  const Fp &p = fp_modulus();
  if (carry || fp_cmp(sum, p) != ComparisonType::Less) {
    Fp reduced;
@@ -235,6 +300,7 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
  } else {
    fp_copy(c, sum);
  }
+#endif
 }

 // Subtraction with modular reduction: c = (a - b) mod p
@@ -243,13 +309,28 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
  Fp diff;
  UNSIGNED_LIMB borrow = fp_sub_raw(diff, a, b);

-  // If there was a borrow, we need to add MODULUS
+#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
+  // Branchless correction: always compute diff + p, select based on borrow.
+  // Same rationale as fp_add -- avoids warp divergence.
+  //   borrow=1 -> a < b, need to add p -> use corrected
+  //   borrow=0 -> a >= b, result is valid -> use diff
+  Fp corrected;
+  fp_add_raw(corrected, diff, fp_modulus());
+  UNSIGNED_LIMB mask =
+      -borrow; // all-ones if borrow (use corrected), all-zeros if not
+
+  for (int i = 0; i < FP_LIMBS; i++) {
+    c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
+  }
+#else
+  // Host path: branching is fine on CPU
  const Fp &p = fp_modulus();
  if (borrow) {
    fp_add_raw(c, diff, p);
  } else {
    fp_copy(c, diff);
  }
+#endif
 }

 // Small-constant multiplication via addition chains.
@@ -453,23 +534,223 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
  }
 }

+// ============================================================================
+// PTX-accelerated CIOS Montgomery multiplication (device path)
+// ============================================================================
+// The CIOS algorithm for 7 x 64-bit limbs executes 98 multiply-accumulate
+// steps across 7 outer iterations. Each step computes:
+//   (carry, t[j]) = t[j] + a[j] * b_i + carry
+// which is a 64x64->128 multiply plus a three-operand addition with carry.
+//
+// The C++ path uses software carry detection: carry = (sum < old) ? 1 : 0.
+// The PTX path below uses hardware carry flags via the .cc suffix:
+//   - mul.lo.u64 / mul.hi.u64 : 64x64->128 wide multiply
+//   - add.cc.u64 / addc.u64   : addition chain with hardware carry flag
+//
+// Each multiply-accumulate step uses 6 PTX instructions instead of ~10+ in
+// the software-carry version. The 7 outer iterations are fully unrolled, and
+// the limb-shift loop (t[j] = t[j+1]) is eliminated by register renaming.
+//
+// REGISTER ALIASING NOTE: All PTX temporaries (_lo, _hi) are declared as
+// .reg inside the asm block. This prevents nvcc's register allocator from
+// aliasing them with C operands (t_j, carry), which was the root cause of
+// previous correctness bugs where "+l" outputs could share registers with
+// "l" inputs in the same asm statement.
+// ============================================================================
+
+#ifdef __CUDA_ARCH__
+#if LIMB_BITS_CONFIG == 64
+
+// Multiply-accumulate one limb: (carry_out, t_j) = t_j + a_j * b_i + carry_in
+//
+// All intermediates (_lo, _hi) are PTX .reg temporaries inside a { } scope
+// block to avoid: (1) nvcc register aliasing between C operands, and (2)
+// duplicate .reg definitions when the macro is expanded multiple times.
+// The 6-instruction sequence:
+//   mul.lo.u64  _lo, a_j, b_i      -- low 64 bits of product
+//   mul.hi.u64  _hi, a_j, b_i      -- high 64 bits of product
+//   add.cc.u64  t_j, t_j, _lo      -- t_j += _lo, set CF
+//   addc.u64    _hi, _hi, 0        -- _hi += CF
+//   add.cc.u64  t_j, t_j, carry    -- t_j += carry_in, set CF
+//   addc.u64    carry, _hi, 0      -- carry_out = _hi + CF
+#define LIMB_MACC(t_j, carry, a_j, b_i)                                        \
+  asm volatile("{\n\t"                                                         \
+               ".reg .u64 _lo, _hi;\n\t"                                       \
+               "mul.lo.u64  _lo, %2, %3;\n\t"                                  \
+               "mul.hi.u64  _hi, %2, %3;\n\t"                                  \
+               "add.cc.u64  %0, %0, _lo;\n\t"                                  \
+               "addc.u64    _hi, _hi, 0;\n\t"                                  \
+               "add.cc.u64  %0, %0, %1;\n\t"                                   \
+               "addc.u64    %1, _hi, 0;\n\t"                                   \
+               "}\n\t"                                                         \
+               : "+l"(t_j), "+l"(carry)                                        \
+               : "l"(a_j), "l"(b_i))
+
+// Single CIOS iteration: multiply-accumulate, reduce, and shift.
+//
+// Computes:
+//   1. t += a * b_i  (7 limb multiply-accumulate with carry chain)
+//   2. m = t[0] * p_prime  (Montgomery reduction factor)
+//   3. t += m * p  (reduction, zeros out t[0])
+//   4. Shift t right by one limb (via register renaming into r0..r7)
+//
+// The macro lets the compiler allocate registers across all 7 unrolled
+// iterations, avoiding spills to local memory.
+#define CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, \
+                           a5, a6, b_i, p0, p1, p2, p3, p4, p5, p6, p_prime,   \
+                           r0, r1, r2, r3, r4, r5, r6, r7)                     \
+  do {                                                                         \
+    uint64_t _carry = 0;                                                       \
+    /* Step 1: t += a * b_i */                                                 \
+    LIMB_MACC(t0, _carry, a0, b_i);                                            \
+    LIMB_MACC(t1, _carry, a1, b_i);                                            \
+    LIMB_MACC(t2, _carry, a2, b_i);                                            \
+    LIMB_MACC(t3, _carry, a3, b_i);                                            \
+    LIMB_MACC(t4, _carry, a4, b_i);                                            \
+    LIMB_MACC(t5, _carry, a5, b_i);                                            \
+    LIMB_MACC(t6, _carry, a6, b_i);                                            \
+    /* Accumulate final carry into overflow limb t7 */                         \
+    uint64_t _overflow;                                                        \
+    asm("add.cc.u64  %0, %0, %2;\n\t"                                          \
+        "addc.u64    %1, 0, 0;\n\t"                                            \
+        : "+l"(t7), "=l"(_overflow)                                            \
+        : "l"(_carry));                                                        \
+                                                                               \
+    /* Step 2: m = t0 * p_prime mod 2^64 */                                    \
+    uint64_t _m = t0 * p_prime;                                                \
+                                                                               \
+    /* Step 3: t += m * p (zeros out t0) */                                    \
+    _carry = 0;                                                                \
+    LIMB_MACC(t0, _carry, _m, p0);                                             \
+    LIMB_MACC(t1, _carry, _m, p1);                                             \
+    LIMB_MACC(t2, _carry, _m, p2);                                             \
+    LIMB_MACC(t3, _carry, _m, p3);                                             \
+    LIMB_MACC(t4, _carry, _m, p4);                                             \
+    LIMB_MACC(t5, _carry, _m, p5);                                             \
+    LIMB_MACC(t6, _carry, _m, p6);                                             \
+    /* Finalize overflow: t7 = t7 + _carry + _overflow */                      \
+    /* Plain adds (no carry chain) -- the CIOS invariant guarantees this */    \
+    /* sum fits in 64 bits so intermediate overflow does not matter. */        \
+    t7 += _carry;                                                              \
+    t7 += _overflow;                                                           \
+                                                                               \
+    /* Step 4: Shift right by one limb via register renaming */                \
+    /* t0 is now zero (by construction of m), discard it */                    \
+    r0 = t1;                                                                   \
+    r1 = t2;                                                                   \
+    r2 = t3;                                                                   \
+    r3 = t4;                                                                   \
+    r4 = t5;                                                                   \
+    r5 = t6;                                                                   \
+    r6 = t7;                                                                   \
+    r7 = 0;                                                                    \
+  } while (0)
+
+__device__ __noinline__ void fp_mont_mul_cios_ptx(Fp &c, const Fp &a,
+                                                  const Fp &b) {
+  const uint64_t p0 = DEVICE_MODULUS.limb[0];
+  const uint64_t p1 = DEVICE_MODULUS.limb[1];
+  const uint64_t p2 = DEVICE_MODULUS.limb[2];
+  const uint64_t p3 = DEVICE_MODULUS.limb[3];
+  const uint64_t p4 = DEVICE_MODULUS.limb[4];
+  const uint64_t p5 = DEVICE_MODULUS.limb[5];
+  const uint64_t p6 = DEVICE_MODULUS.limb[6];
+  const uint64_t pp = DEVICE_P_PRIME;
+
+  const uint64_t a0 = a.limb[0], a1 = a.limb[1], a2 = a.limb[2];
+  const uint64_t a3 = a.limb[3], a4 = a.limb[4], a5 = a.limb[5];
+  const uint64_t a6 = a.limb[6];
+
+  // Accumulator: 7 limbs + 1 overflow, initialized to zero
+  uint64_t t0 = 0, t1 = 0, t2 = 0, t3 = 0;
+  uint64_t t4 = 0, t5 = 0, t6 = 0, t7 = 0;
+
+  // 7 fully-unrolled CIOS iterations with register renaming for the shift.
+  // Each iteration processes one limb of b, accumulates a*b[i], reduces,
+  // and shifts. The output registers become the input for the next iteration.
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[0], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[1], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[2], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[3], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[4], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[5], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
+                     b.limb[6], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
+                     t4, t5, t6, t7);
+
+  // Final reduction: if t[0..7] >= p (extended to 8 limbs), subtract p.
+  // Compute (t[0..6] - p[0..6]) with borrow, then subtract borrow from t7.
+  // If t7 after subtraction is non-negative, the reduced result is valid;
+  // otherwise the original t[0..6] is already in [0, p).
+  uint64_t r0, r1, r2, r3, r4, r5, r6, mask;
+  asm("sub.cc.u64   %0, %8,  %15;\n\t" // r0 = t0 - p0
+      "subc.cc.u64  %1, %9,  %16;\n\t" // r1 = t1 - p1 - borrow
+      "subc.cc.u64  %2, %10, %17;\n\t" // r2 = t2 - p2 - borrow
+      "subc.cc.u64  %3, %11, %18;\n\t" // r3 = t3 - p3 - borrow
+      "subc.cc.u64  %4, %12, %19;\n\t" // r4 = t4 - p4 - borrow
+      "subc.cc.u64  %5, %13, %20;\n\t" // r5 = t5 - p5 - borrow
+      "subc.cc.u64  %6, %14, %21;\n\t" // r6 = t6 - p6 - borrow
+      "subc.u64     %7, %22, 0;\n\t"   // mask_src = t7 - 0 - borrow
+      "shr.s64      %7, %7, 63;\n\t" // mask = sign-extend: -1 if negative, 0 if
+                                     // >= 0
+      : "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
+        "=l"(mask)
+      : "l"(t0), "l"(t1), "l"(t2), "l"(t3), "l"(t4), "l"(t5), "l"(t6), "l"(p0),
+        "l"(p1), "l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6), "l"(t7));
+
+  // Branchless selection:
+  //   mask = 0  -> t >= p (use reduced r[0..6])
+  //   mask = -1 -> t < p  (keep original t[0..6])
+  c.limb[0] = (t0 & mask) | (r0 & ~mask);
+  c.limb[1] = (t1 & mask) | (r1 & ~mask);
+  c.limb[2] = (t2 & mask) | (r2 & ~mask);
+  c.limb[3] = (t3 & mask) | (r3 & ~mask);
+  c.limb[4] = (t4 & mask) | (r4 & ~mask);
+  c.limb[5] = (t5 & mask) | (r5 & ~mask);
+  c.limb[6] = (t6 & mask) | (r6 & ~mask);
+}
+
+#undef LIMB_MACC
+#undef CIOS_ITERATION_PTX
+
+#endif // LIMB_BITS_CONFIG == 64
+#endif // __CUDA_ARCH__
+
 // CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication
 // Fuses multiplication and reduction in a single pass for better efficiency.
 // Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
 // Both a and b are in Montgomery form, result is in Montgomery form.
 __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
+#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
+  // Device path: fully unrolled PTX with hardware carry flags
+  fp_mont_mul_cios_ptx(c, a, b);
+#else
+  // Host path: portable C++ implementation
  const Fp &p = fp_modulus();
  UNSIGNED_LIMB p_prime = fp_p_prime();

  // Working array: only n+1 limbs needed (vs 2n for separate mul+reduce)
  UNSIGNED_LIMB t[FP_LIMBS + 1];
-#ifdef __CUDA_ARCH__
-  for (int i = 0; i < FP_LIMBS + 1; i++) {
-    t[i] = 0;
-  }
-#else
  memset(t, 0, (FP_LIMBS + 1) * sizeof(UNSIGNED_LIMB));
-#endif

  // Main CIOS loop: for each limb of b
  for (int i = 0; i < FP_LIMBS; i++) {
@@ -529,14 +810,7 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
  }

  // Copy result to output
-#ifdef __CUDA_ARCH__
-#pragma unroll
-  for (int i = 0; i < FP_LIMBS; i++) {
-    c.limb[i] = t[i];
-  }
-#else
  memcpy(&c.limb[0], t, FP_LIMBS * sizeof(UNSIGNED_LIMB));
-#endif

  // Final reduction: if result >= p or there's overflow, subtract p
  if (t[FP_LIMBS] != 0 || fp_cmp(c, p) != ComparisonType::Less) {
@@ -545,6 +819,7 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
    fp_copy(c, reduced);
  }
  // Result is in Montgomery form
+#endif
 }

 // Montgomery multiplication: c = (a * b * R_INV) mod p
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
@@ -23,7 +23,8 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)

 # Build device library from tfhe-cuda-backend
-add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
+add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
+                                     ${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
 set_target_properties(
  tfhe_device_bench
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_msm.cu
@@ -140,14 +140,14 @@ static void BM_G1_MSM(benchmark::State &state) {
  // Warm-up iterations
  for (int i = 0; i < WARMUP_ITERATIONS; i++) {
    point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+                       d_scalars, n, d_scratch);
  }
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

  // Benchmark loop: only measure the MSM computation, no memory operations
  for (auto _ : state) {
    point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+                       d_scalars, n, d_scratch);
    benchmark::ClobberMemory();
  }

@@ -221,14 +221,14 @@ static void BM_G2_MSM(benchmark::State &state) {
  // Warm-up iterations
  for (int i = 0; i < WARMUP_ITERATIONS; i++) {
    point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+                       d_scalars, n, d_scratch);
  }
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

  // Benchmark loop: only measure the MSM computation, no memory operations
  for (auto _ : state) {
    point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+                       d_scalars, n, d_scratch);
    benchmark::ClobberMemory();
  }

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
@@ -20,7 +20,8 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)

 # Build device library from tfhe-cuda-backend
-add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
+add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
+                               ${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
 set_target_properties(
  tfhe_device
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_curve_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_curve_ops.cu
@@ -13,8 +13,8 @@
 //   ./build/tests_and_benchmarks/tests/basic/basic_curve_ops

 #include "curve.h"
+#include "device.h"
 #include "fp.h"
-#include <cassert>
 #include <cstdio>
 #include <cstring>

@@ -24,7 +24,7 @@ int main() {
  // (non-Montgomery) form. Convert to Montgomery, then lift to projective for
  // host-side arithmetic.
  const G1Affine &gen_normal = g1_generator();
-  assert(!g1_is_infinity(gen_normal));
+  PANIC_IF_FALSE(!g1_is_infinity(gen_normal), "generator must not be infinity");

  G1Affine gen_affine = gen_normal;
  point_to_montgomery_inplace(gen_affine);
@@ -37,21 +37,21 @@ int main() {

  // G + (-G) = identity (Z = 0 in the projective convention)
  G1Projective identity = G + neg_G;
-  assert(fp_is_zero(identity.Z));
+  PANIC_IF_FALSE(fp_is_zero(identity.Z), "G + (-G) must be identity (Z = 0)");
  printf("Negation (-G) and G + (-G) = identity: OK\n");

  // ---- Addition: 2*G = G + G, 3*G = 2*G + G ----
  G1Projective two_G = G + G;
-  assert(!(two_G == G1Projective())); // not the identity
+  PANIC_IF_FALSE(!(two_G == G1Projective{}), "2*G must not be identity");

  G1Projective three_G = two_G + G;
-  assert(!(three_G == G1Projective()));
+  PANIC_IF_FALSE(!(three_G == G1Projective{}), "3*G must not be identity");
  printf("Addition (2*G, 3*G): OK\n");

  // ---- Compound assignment: G += G ----
  G1Projective acc = G;
  acc += G; // acc = 2*G
-  assert(acc == two_G);
+  PANIC_IF_FALSE(acc == two_G, "G += G must equal 2*G");
  printf("Compound assignment (+=): OK\n");

  // ---- Scalar multiplication: 3*G using Scalar type ----
@@ -61,19 +61,22 @@ int main() {
  scalar_3.limb[0] = 3;

  G1Projective three_G_via_scalar = G * scalar_3;
-  assert(!(three_G_via_scalar == G1Projective()));
+  PANIC_IF_FALSE(!(three_G_via_scalar == G1Projective{}),
+                 "3*G via scalar must not be identity");

  // Normalise both to Z = 1 (Montgomery) before comparing coordinates.
  normalize_projective_g1(three_G);
  normalize_projective_g1(three_G_via_scalar);
-  assert(three_G == three_G_via_scalar);
+  PANIC_IF_FALSE(three_G == three_G_via_scalar,
+                 "3*G via addition must equal 3*G via scalar multiply");
  printf("Scalar multiplication (3*G == G + G + G): OK\n");

  // ---- Projective -> affine conversion ----
  // projective_to_affine_g1 keeps coordinates in Montgomery form.
  G1Affine three_G_affine;
  projective_to_affine_g1(three_G_affine, three_G);
-  assert(!g1_is_infinity(three_G_affine));
+  PANIC_IF_FALSE(!g1_is_infinity(three_G_affine),
+                 "3*G in affine must not be infinity");
  printf("Projective -> affine conversion: OK\n");

  // ---- Convert to normal-form coordinates ----
@@ -82,7 +85,8 @@ int main() {
  G1Projective result = three_G_via_scalar;
  normalize_from_montgomery_g1(
      result); // coordinates now in normal (non-Montgomery) form
-  assert(!fp_is_zero(result.Z)); // Z = 1 (non-zero)
+  PANIC_IF_FALSE(!fp_is_zero(result.Z),
+                 "normalized result must have non-zero Z");
  printf("Conversion to normal-form projective: OK\n");

  printf("All G1 curve operations passed.\n");
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_fp_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_fp_ops.cu
@@ -11,8 +11,8 @@
 //   cmake --build build --target basic_fp_ops
 //   ./build/tests_and_benchmarks/tests/basic/basic_fp_ops

+#include "device.h"
 #include "fp.h"
-#include <cassert>
 #include <cstdio>

 int main() {
@@ -25,16 +25,16 @@ int main() {
  fp_one(b); // b = 1

  c = a + b; // c = 2
-  assert(c.limb[0] == 2);
+  PANIC_IF_FALSE(c.limb[0] == 2, "1 + 1 must equal 2");

  c = c - a; // c = 1
-  assert(fp_is_one(c));
+  PANIC_IF_FALSE(fp_is_one(c), "2 - 1 must equal 1");

  // Compound assignment
  c += a; // c = 2
-  assert(c.limb[0] == 2);
+  PANIC_IF_FALSE(c.limb[0] == 2, "1 += 1 must equal 2");
  c -= b; // c = 1
-  assert(fp_is_one(c));
+  PANIC_IF_FALSE(fp_is_one(c), "2 -= 1 must equal 1");

  printf("Addition/subtraction: OK\n");

@@ -43,7 +43,7 @@ int main() {
  // form, but for add/sub/neg small normal-form values also work correctly.
  Fp neg_a = -a; // neg_a = -1 mod p
  Fp sum = a + neg_a;
-  assert(fp_is_zero(sum)); // 1 + (-1) = 0
+  PANIC_IF_FALSE(fp_is_zero(sum), "1 + (-1) must equal 0");
  printf("Negation: OK\n");

  // ---- Multiplication (Montgomery form required) ----
@@ -56,17 +56,17 @@ int main() {

  result_m = one_m * two_m; // result_m = 2 in Montgomery form
  fp_from_montgomery(result, result_m);
-  assert(result.limb[0] == 2);
+  PANIC_IF_FALSE(result.limb[0] == 2, "1 * 2 must equal 2");

  result_m = two_m * two_m; // result_m = 4 in Montgomery form
  fp_from_montgomery(result, result_m);
-  assert(result.limb[0] == 4);
+  PANIC_IF_FALSE(result.limb[0] == 4, "2 * 2 must equal 4");

  // Compound multiplication
  result_m = two_m;
  result_m *= two_m; // result_m = 4
  fp_from_montgomery(result, result_m);
-  assert(result.limb[0] == 4);
+  PANIC_IF_FALSE(result.limb[0] == 4, "2 *= 2 must equal 4");

  // Convert an arbitrary normal-form value to Montgomery before multiplying
  Fp five_normal, five_m, twenty_five_m, twenty_five;
@@ -76,7 +76,7 @@ int main() {

  fp_mont_mul(twenty_five_m, five_m, five_m); // 5 * 5 = 25
  fp_from_montgomery(twenty_five, twenty_five_m);
-  assert(twenty_five.limb[0] == 25);
+  PANIC_IF_FALSE(twenty_five.limb[0] == 25, "5 * 5 must equal 25");

  printf("Multiplication: OK\n");

@@ -88,7 +88,7 @@ int main() {

  Fp one_check;
  fp_div(one_check, five_normal, five_normal); // 5 / 5 = 1
-  assert(fp_is_one(one_check));
+  PANIC_IF_FALSE(fp_is_one(one_check), "5 / 5 must equal 1");

  // Verify: 5 * 5^{-1} == 1  (using fp_div as a cross-check)
  Fp product;
@@ -98,7 +98,7 @@ int main() {
  fp_zero(two_normal);
  two_normal.limb[0] = 2;
  fp_div(product, two_normal, two_normal); // 2 / 2 = 1
-  assert(fp_is_one(product));
+  PANIC_IF_FALSE(fp_is_one(product), "2 / 2 must equal 1");

  printf("Inversion/division: OK\n");

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_msm.cu
@@ -20,7 +20,6 @@
 #include "device.h"
 #include "fp.h"
 #include "msm.h"
-#include <cassert>
 #include <cstdio>
 #include <cstring>
 #include <vector>
@@ -33,7 +32,6 @@ int main() {

  const uint32_t gpu_index = 0;
  const uint32_t n = 4; // number of points / scalars
-  uint64_t size_tracker = 0;

  // ---- Prepare host-side points in Montgomery form ----
  // Use n doublings of the G1 generator: G, 2*G, 4*G, 8*G.
@@ -76,8 +74,7 @@ int main() {

  // ---- Run MSM (synchronous wrapper; result written directly to host) ----
  G1Projective h_result;
-  point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch,
-               size_tracker, true);
+  point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch);

  // ---- Verify against naive sequential computation on the host ----
  // Expected = sum over i of (scalar[i] * point[i]).
@@ -95,7 +92,8 @@ int main() {
  // Normalise to Z = 1 (Montgomery) before comparing projective coordinates.
  normalize_projective_g1(h_result);
  normalize_projective_g1(expected);
-  assert(h_result == expected);
+  PANIC_IF_FALSE(h_result == expected,
+                 "MSM result must match naive sequential computation");
  printf("MSM result matches naive sequential computation.\n");

  // ---- Cleanup ----
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp.cu
@@ -3,6 +3,7 @@
 #include "fp.h"
 #include "fp_helpers.h" // Include test-only batch operations and kernels
 #include <chrono>
+#include <cinttypes>
 #include <cstdint>
 #include <cstring>
 #include <cuda_runtime.h>
@@ -297,7 +298,7 @@ protected:

 // Test basic addition (on GPU)
 TEST_F(FpArithmeticTest, Addition) {
-  uint64_t size_tracker = 0;
+
  Fp a, b, c, c_cpu;

  // Test: 1 + 1 = 2
@@ -320,7 +321,7 @@ TEST_F(FpArithmeticTest, Addition) {

 // Test subtraction (on GPU)
 TEST_F(FpArithmeticTest, Subtraction) {
-  uint64_t size_tracker = 0;
+
  Fp a, b, c, a_cpu;

  // Test: 2 - 1 = 1
@@ -341,7 +342,7 @@ TEST_F(FpArithmeticTest, Subtraction) {

 // Test multiplication (on GPU)
 TEST_F(FpArithmeticTest, Multiplication) {
-  uint64_t size_tracker = 0;
+
  Fp five, three, result, expected;

  fp_zero(five);
@@ -370,7 +371,7 @@ TEST_F(FpArithmeticTest, Multiplication) {

 // Test negation (on GPU)
 TEST_F(FpArithmeticTest, Negation) {
-  uint64_t size_tracker = 0;
+
  Fp a, neg_a, result;

  fp_zero(a);
@@ -395,7 +396,7 @@ TEST_F(FpArithmeticTest, Negation) {

 // Test Montgomery conversion round-trip (on GPU)
 TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {
-  uint64_t size_tracker = 0;
+
  Fp value, mont_form, back, mont_form_cpu, back_cpu;

  fp_zero(value);
@@ -421,7 +422,7 @@ TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {

 // Test Montgomery multiplication (on GPU)
 TEST_F(FpArithmeticTest, MontgomeryMultiplication) {
-  uint64_t size_tracker = 0;
+
  Fp five, three, five_m, three_m, result_m, result, expected, result_cpu;

  fp_zero(five);
@@ -460,7 +461,7 @@ TEST_F(FpArithmeticTest, MontgomeryMultiplication) {

 // Test comparison operations (on GPU)
 TEST_F(FpArithmeticTest, Comparison) {
-  uint64_t size_tracker = 0;
+
  Fp five, three;

  fp_zero(five);
@@ -481,7 +482,7 @@ TEST_F(FpArithmeticTest, Comparison) {

 // Test zero and one (on GPU)
 TEST_F(FpArithmeticTest, ZeroAndOne) {
-  uint64_t size_tracker = 0;
+
  Fp zero, one;

  fp_zero(zero);
@@ -499,7 +500,7 @@ TEST_F(FpArithmeticTest, ZeroAndOne) {

 // Test copy (on GPU)
 TEST_F(FpArithmeticTest, Copy) {
-  uint64_t size_tracker = 0;
+
  Fp a, b, b_cpu;

  fp_zero(a);
@@ -522,7 +523,7 @@ TEST_F(FpArithmeticTest, Copy) {

 // Test conditional move (on GPU)
 TEST_F(FpArithmeticTest, ConditionalMove) {
-  uint64_t size_tracker = 0;
+
  Fp a, b, result, result_cpu;

  fp_zero(a);
@@ -563,7 +564,7 @@ TEST_F(FpArithmeticTest, ConditionalMove) {

 // Test multiplication by zero (on GPU)
 TEST_F(FpArithmeticTest, MultiplicationByZero) {
-  uint64_t size_tracker = 0;
+
  Fp a, zero, result, result_cpu;

  fp_zero(zero);
@@ -591,7 +592,7 @@ TEST_F(FpArithmeticTest, MultiplicationByZero) {

 // Test inversion (on GPU)
 TEST_F(FpArithmeticTest, Inversion) {
-  uint64_t size_tracker = 0;
+
  Fp a, a_inv, result, a_inv_cpu;

  fp_zero(a);
@@ -623,7 +624,7 @@ TEST_F(FpArithmeticTest, Inversion) {

 // Test inversion of one (on GPU)
 TEST_F(FpArithmeticTest, InversionOfOne) {
-  uint64_t size_tracker = 0;
+
  Fp one, one_inv, one_inv_cpu;

  fp_one(one);
@@ -645,7 +646,7 @@ TEST_F(FpArithmeticTest, InversionOfOne) {

 // Test division (on GPU)
 TEST_F(FpArithmeticTest, Division) {
-  uint64_t size_tracker = 0;
+
  Fp a, b, quotient, result;

  fp_zero(a);
@@ -678,7 +679,7 @@ TEST_F(FpArithmeticTest, Division) {

 // Test division by one (on GPU)
 TEST_F(FpArithmeticTest, DivisionByOne) {
-  uint64_t size_tracker = 0;
+
  Fp a, one, result;

  fp_one(one);
@@ -707,7 +708,7 @@ TEST_F(FpArithmeticTest, DivisionByOne) {

 // Test exponentiation with small exponent (on GPU)
 TEST_F(FpArithmeticTest, ExponentiationSmall) {
-  uint64_t size_tracker = 0;
+
  Fp base, result, expected, result_cpu;

  fp_zero(base);
@@ -734,7 +735,7 @@ TEST_F(FpArithmeticTest, ExponentiationSmall) {

 // Test exponentiation to power of one (on GPU)
 TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {
-  uint64_t size_tracker = 0;
+
  Fp base, result, result_cpu;

  fp_zero(base);
@@ -758,7 +759,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {

 // Test exponentiation to power of zero (on GPU)
 TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {
-  uint64_t size_tracker = 0;
+
  Fp base, result, one, result_cpu;

  fp_zero(base);
@@ -782,7 +783,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {

 // Test exponentiation with large exponent (Fermat's little theorem)
 TEST_F(FpArithmeticTest, ExponentiationFermat) {
-  uint64_t size_tracker = 0;
+
  Fp a, result;

  fp_zero(a);
@@ -798,7 +799,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermat) {

 // Test exponentiation: a^(p-1) = 1 mod p
 TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {
-  uint64_t size_tracker = 0;
+
  Fp a, result, one;

  fp_zero(a);
@@ -820,7 +821,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {

 // Test square root (on GPU)
 TEST_F(FpArithmeticTest, SquareRoot) {
-  uint64_t size_tracker = 0;
+
  Fp a, square, sqrt_result, verify, square_cpu, sqrt_result_cpu, verify_cpu;

  // Test: sqrt(a^2) = a or -a
@@ -872,6 +873,12 @@ TEST_F(FpArithmeticTest, SquareRoot) {
    // Also test on CPU for comparison
    Fp neg_a_cpu = -a;

+    // Verify GPU negation matches CPU negation
+    EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &neg_a, &neg_a_cpu),
+              ComparisonType::Equal)
+        << "GPU negation should match CPU negation";
+    cuda_synchronize_stream(stream, gpu_index);
+
    bool matches_a = (fp_cmp_gpu(stream, gpu_index, &sqrt_result, &a) ==
                      ComparisonType::Equal);
    cuda_synchronize_stream(stream, gpu_index);
@@ -891,7 +898,7 @@ TEST_F(FpArithmeticTest, SquareRoot) {

 // Test square root of zero (on GPU)
 TEST_F(FpArithmeticTest, SquareRootOfZero) {
-  uint64_t size_tracker = 0;
+
  Fp zero, result, result_cpu;
  fp_zero(zero);

@@ -913,7 +920,7 @@ TEST_F(FpArithmeticTest, SquareRootOfZero) {

 // Test square root of one (on GPU)
 TEST_F(FpArithmeticTest, SquareRootOfOne) {
-  uint64_t size_tracker = 0;
+
  Fp one, result, result_cpu;
  fp_one(one);

@@ -935,7 +942,7 @@ TEST_F(FpArithmeticTest, SquareRootOfOne) {

 // Test quadratic residue check (on GPU)
 TEST_F(FpArithmeticTest, IsQuadraticResidue) {
-  uint64_t size_tracker = 0;
+
  Fp a, square, square_cpu, zero;

  fp_zero(a);
@@ -971,7 +978,7 @@ TEST_F(FpArithmeticTest, IsQuadraticResidue) {
 // device. For now, we test individual conversions on GPU and verify with GPU
 // comparisons
 TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {
-  uint64_t size_tracker = 0;
+
  const int n = 10;
  Fp normal[n], montgomery[n], back[n];

@@ -1011,7 +1018,7 @@ TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {

 // Test 1: Addition that doesn't overflow (on GPU)
 TEST_F(FpArithmeticTest, LargeAddition1) {
-  uint64_t size_tracker = 0;
+
  // a = large value
  Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
                             0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
@@ -1044,7 +1051,7 @@ TEST_F(FpArithmeticTest, LargeAddition1) {

 // Test 2: Addition that triggers reduction (sum > p) (on GPU)
 TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {
-  uint64_t size_tracker = 0;
+
  // Use two large numbers that will trigger reduction
  // a + b should wrap around modulus
  Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
@@ -1076,7 +1083,7 @@ TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {

 // Test 3: Subtraction without borrow (on GPU)
 TEST_F(FpArithmeticTest, LargeSubtraction1) {
-  uint64_t size_tracker = 0;
+
  // a = large value
  Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
                             0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
@@ -1106,11 +1113,15 @@ TEST_F(FpArithmeticTest, LargeSubtraction1) {
            ComparisonType::Equal)
      << "GPU result should match CPU result";
  cuda_synchronize_stream(stream, gpu_index);
+  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &verify, &verify_cpu),
+            ComparisonType::Equal)
+      << "GPU subtraction roundtrip should match CPU roundtrip";
+  cuda_synchronize_stream(stream, gpu_index);
 }

 // Test 4: Subtraction with borrow (a < b) (on GPU)
 TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {
-  uint64_t size_tracker = 0;
+
  // a = 50
  Fp a = test_utils::make_fp(0x32ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL,
                             0x0ULL);
@@ -1144,7 +1155,7 @@ TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {

 // Test 5: Multiplication of large values (triggers reduction) (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication1) {
-  uint64_t size_tracker = 0;
+
  // a = 2^200 (bit 200 set)
  Fp a;
  fp_zero(a);
@@ -1190,7 +1201,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication1) {

 // Test 6: (p-1) * (p-1) = 1 (mod p) (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
-  uint64_t size_tracker = 0;
+
  // a = p - 1
  Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
                             0x824e6dc3e23acdeeULL, 0xf75a64bbac71602ULL,
@@ -1228,7 +1239,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {

 // Test 7: Multiplication with 2: a * 2 = a + a (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
-  uint64_t size_tracker = 0;
+
  // a = large value
  Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAE6ULL,
                             0x412736E1F11D66F7ULL, 0x7BAD325DD638B01ULL,
@@ -1264,11 +1275,15 @@ TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
            ComparisonType::Equal)
      << "GPU result should match CPU result";
  cuda_synchronize_stream(stream, gpu_index);
+  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &expected, &expected_cpu),
+            ComparisonType::Equal)
+      << "GPU addition should match CPU addition";
+  cuda_synchronize_stream(stream, gpu_index);
 }

 // Test 8: Large number squared (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
-  uint64_t size_tracker = 0;
+
  // a = large value
  Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
                             0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL);
@@ -1305,7 +1320,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication4Square) {

 // Test 9: Addition chain near modulus (on GPU)
 TEST_F(FpArithmeticTest, LargeAddition3Chain) {
-  uint64_t size_tracker = 0;
+
  // Start with p-1
  Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
                             0x824e6dc3e23acdeeULL, 0x0f75a64bbac71602ULL,
@@ -1332,7 +1347,7 @@ TEST_F(FpArithmeticTest, LargeAddition3Chain) {

 // Test 10: Complex multiplication with reduction (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
-  uint64_t size_tracker = 0;
+
  // a = large prime-like number
  Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
                             0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
@@ -1386,7 +1401,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {

 // Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
 TEST_F(FpPropertyTest, AdditionAssociativity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1412,7 +1427,7 @@ TEST_F(FpPropertyTest, AdditionAssociativity) {

 // Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
 TEST_F(FpPropertyTest, MultiplicationAssociativity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) { // Fewer iterations due to multiplication cost
    Fp a = random_value();
    Fp b = random_value();
@@ -1438,7 +1453,7 @@ TEST_F(FpPropertyTest, MultiplicationAssociativity) {

 // Test distributivity: a * (b + c) = a*b + a*c (on GPU)
 TEST_F(FpPropertyTest, MultiplicationDistributivity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1465,7 +1480,7 @@ TEST_F(FpPropertyTest, MultiplicationDistributivity) {

 // Test addition commutativity with random values (on GPU)
 TEST_F(FpPropertyTest, AdditionCommutativityRandom) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1483,7 +1498,7 @@ TEST_F(FpPropertyTest, AdditionCommutativityRandom) {

 // Test multiplication commutativity with random values (on GPU)
 TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1501,7 +1516,7 @@ TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {

 // Test additive identity: a + 0 = a (on GPU)
 TEST_F(FpPropertyTest, AdditiveIdentity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp result;
@@ -1517,7 +1532,7 @@ TEST_F(FpPropertyTest, AdditiveIdentity) {

 // Test multiplicative identity: a * 1 = a (on GPU)
 TEST_F(FpPropertyTest, MultiplicativeIdentity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp result;
@@ -1533,7 +1548,7 @@ TEST_F(FpPropertyTest, MultiplicativeIdentity) {

 // Test additive inverse: a + (-a) = 0 (on GPU)
 TEST_F(FpPropertyTest, AdditiveInverse) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp neg_a, result;
@@ -1550,7 +1565,7 @@ TEST_F(FpPropertyTest, AdditiveInverse) {

 // Test double negation: -(-a) = a (on GPU)
 TEST_F(FpPropertyTest, DoubleNegation) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp neg_a, neg_neg_a;
@@ -1568,7 +1583,7 @@ TEST_F(FpPropertyTest, DoubleNegation) {

 // Test subtraction as addition of negation: a - b = a + (-b) (on GPU)
 TEST_F(FpPropertyTest, SubtractionAsNegation) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1590,7 +1605,7 @@ TEST_F(FpPropertyTest, SubtractionAsNegation) {

 // Test Montgomery form round-trip with random values (on GPU)
 TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp mont_form, back;
@@ -1607,7 +1622,7 @@ TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {

 // Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
 TEST_F(FpPropertyTest, MultiplicativeInverse) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1630,7 +1645,7 @@ TEST_F(FpPropertyTest, MultiplicativeInverse) {

 // Test division: (a / b) * b = a (on GPU)
 TEST_F(FpPropertyTest, DivisionProperty) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1654,7 +1669,7 @@ TEST_F(FpPropertyTest, DivisionProperty) {

 // Test division as multiplication by inverse: a / b = a * b^(-1) (on GPU)
 TEST_F(FpPropertyTest, DivisionAsInverse) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1682,7 +1697,7 @@ TEST_F(FpPropertyTest, DivisionAsInverse) {

 // Test exponentiation: (a^e1)^e2 = a^(e1*e2) for small exponents (on GPU)
 TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1716,7 +1731,7 @@ TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {

 // Test exponentiation: a^e1 * a^e2 = a^(e1+e2) (on GPU)
 TEST_F(FpPropertyTest, ExponentiationProduct) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1751,7 +1766,7 @@ TEST_F(FpPropertyTest, ExponentiationProduct) {

 // Test inversion of inversion: (a^(-1))^(-1) = a (on GPU)
 TEST_F(FpPropertyTest, DoubleInversion) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1775,7 +1790,7 @@ TEST_F(FpPropertyTest, DoubleInversion) {

 // Test square root property: sqrt(a^2) = a (for random a) (on GPU)
 TEST_F(FpPropertyTest, SquareRootProperty) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp square, sqrt_result, verify;
@@ -1819,7 +1834,7 @@ TEST_F(FpPropertyTest, SquareRootProperty) {
 // Test quadratic residue property: squares are always quadratic residues (on
 // GPU)
 TEST_F(FpPropertyTest, QuadraticResidueProperty) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp square;
@@ -1841,7 +1856,7 @@ TEST_F(FpPropertyTest, QuadraticResidueProperty) {

 // Test operations with p-1 (on GPU)
 TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {
-  uint64_t size_tracker = 0;
+
  // (p-1) + 1 = 0 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &modulus_minus_one, &one);
@@ -1868,7 +1883,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {

 // Test operations with p-2 (on GPU)
 TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {
-  uint64_t size_tracker = 0;
+
  // (p-2) + 1 = p-1 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &modulus_minus_two, &one);
@@ -1888,7 +1903,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {

 // Test operations with very small values (on GPU)
 TEST_F(FpEdgeCaseTest, VerySmallValues) {
-  uint64_t size_tracker = 0;
+
  Fp zero_val, one_val, two_val, three_val;
  fp_zero(zero_val);
  fp_one(one_val);
@@ -1930,7 +1945,7 @@ TEST_F(FpEdgeCaseTest, VerySmallValues) {

 // Test operations with max limb values (on GPU)
 TEST_F(FpEdgeCaseTest, MaxLimbValues) {
-  uint64_t size_tracker = 0;
+
  // Test that max_limb_value is valid
  EXPECT_TRUE(test_utils::is_valid_fp(max_limb_value))
      << "max_limb_value should be < p";
@@ -1953,7 +1968,7 @@ TEST_F(FpEdgeCaseTest, MaxLimbValues) {

 // Test operations with alternating bit patterns (on GPU)
 TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {
-  uint64_t size_tracker = 0;
+
  // Test that alternating_bits is valid
  EXPECT_TRUE(test_utils::is_valid_fp(alternating_bits))
      << "alternating_bits should be < p";
@@ -1976,7 +1991,7 @@ TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {

 // Test edge case: zero operations (on GPU)
 TEST_F(FpEdgeCaseTest, ZeroOperations) {
-  uint64_t size_tracker = 0;
+
  // 0 + 0 = 0 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &zero, &zero);
@@ -2006,7 +2021,7 @@ TEST_F(FpEdgeCaseTest, ZeroOperations) {

 // Test edge case: one operations (on GPU)
 TEST_F(FpEdgeCaseTest, OneOperations) {
-  uint64_t size_tracker = 0;
+
  // 1 + 1 = 2 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &one, &one);
@@ -2033,7 +2048,7 @@ TEST_F(FpEdgeCaseTest, OneOperations) {

 // Test fp_one_montgomery (on GPU)
 TEST_F(FpEdgeCaseTest, OneMontgomery) {
-  uint64_t size_tracker = 0;
+
  Fp one_mont, one_normal;
  fp_one(one_normal);
  fp_one_montgomery(one_mont);
@@ -2049,7 +2064,7 @@ TEST_F(FpEdgeCaseTest, OneMontgomery) {

 // Test repeated operations (stress test) (on GPU)
 TEST_F(FpEdgeCaseTest, RepeatedOperations) {
-  uint64_t size_tracker = 0;
+
  Fp a = test_utils::random_fp(rng);
  Fp result = a;

@@ -2091,7 +2106,7 @@ TEST_F(FpEdgeCaseTest, RepeatedOperations) {

 // Test CUDA kernel: array addition
 TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {
-  uint64_t size_tracker = 0;
+
  const int n = 1000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2126,7 +2141,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {

 // Test CUDA kernel: array multiplication
 TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
-  uint64_t size_tracker = 0;
+
  const int n = 1000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2166,7 +2181,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {

 // Test CUDA kernel: array addition with edge cases
 TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {
-  uint64_t size_tracker = 0;
+
  const int n = 100;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2216,7 +2231,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {

 // Test CUDA kernel: array multiplication with edge cases
 TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
-  uint64_t size_tracker = 0;
+
  const int n = 100;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2271,7 +2286,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {

 // Test CUDA kernel: large array
 TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
-  uint64_t size_tracker = 0;
+
  const int n = 10000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2312,7 +2327,7 @@ TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
 // Test CUDA kernel: boundary conditions for launch configuration
 // Tests that the "if (idx < n)" check works correctly at block boundaries
 TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {
-  uint64_t size_tracker = 0;
+
  // Test sizes that stress the launch configuration
  // threadsPerBlock = 256, so test around block boundaries
  std::vector<int> test_sizes = {1,   255, 256,  257,  511,
@@ -2352,7 +2367,7 @@ TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {

 // Test CUDA kernel: verify kernel actually launches (not just CPU fallback)
 TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {
-  uint64_t size_tracker = 0;
+
  const int n = 1000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2384,7 +2399,7 @@ TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {

 // Test CUDA kernel: verify device constant memory is accessible
 TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
-  uint64_t size_tracker = 0;
+
  // This test verifies that DEVICE_MODULUS is properly initialized
  // by running a kernel that uses it (multiplication uses Montgomery which
  // needs modulus)
@@ -2428,7 +2443,7 @@ TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {

 // Test CUDA kernel: empty array (edge case)
 TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {
-  uint64_t size_tracker = 0;
+
  const int n = 0;
  Fp *h_a = nullptr;
  Fp *h_b = nullptr;
@@ -2445,7 +2460,7 @@ TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {

 // Test CUDA kernel: single element
 TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
-  uint64_t size_tracker = 0;
+
  const int n = 1;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2471,44 +2486,53 @@ TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
 // ============================================================================

 // Test to print generator values (for hardcoding)
+// PRIx64 format specifiers require 64-bit limbs
+#if LIMB_BITS_CONFIG == 64
 TEST_F(FpArithmeticTest, PrintGenerators) {
-  uint64_t size_tracker = 0;
+
  const G1Affine &g1 = g1_generator();
  const G2Affine &g2 = g2_generator();

  printf("\n=== G1 Generator (Montgomery form) ===\n");
-  printf("x: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
-         "0x%llxULL, 0x%llxULL}\n",
+  printf("x: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL}\n",
         g1.x.limb[0], g1.x.limb[1], g1.x.limb[2], g1.x.limb[3], g1.x.limb[4],
         g1.x.limb[5], g1.x.limb[6]);
-  printf("y: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
-         "0x%llxULL, 0x%llxULL}\n",
+  printf("y: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL}\n",
         g1.y.limb[0], g1.y.limb[1], g1.y.limb[2], g1.y.limb[3], g1.y.limb[4],
         g1.y.limb[5], g1.y.limb[6]);

  printf("\n=== G2 Generator (Montgomery form) ===\n");
-  printf("x.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
-         "0x%llxULL, 0x%llxULL}\n",
+  printf("x.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL}\n",
         g2.x.c0.limb[0], g2.x.c0.limb[1], g2.x.c0.limb[2], g2.x.c0.limb[3],
         g2.x.c0.limb[4], g2.x.c0.limb[5], g2.x.c0.limb[6]);
-  printf("x.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
-         "0x%llxULL, 0x%llxULL}\n",
+  printf("x.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL}\n",
         g2.x.c1.limb[0], g2.x.c1.limb[1], g2.x.c1.limb[2], g2.x.c1.limb[3],
         g2.x.c1.limb[4], g2.x.c1.limb[5], g2.x.c1.limb[6]);
-  printf("y.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
-         "0x%llxULL, 0x%llxULL}\n",
+  printf("y.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL}\n",
         g2.y.c0.limb[0], g2.y.c0.limb[1], g2.y.c0.limb[2], g2.y.c0.limb[3],
         g2.y.c0.limb[4], g2.y.c0.limb[5], g2.y.c0.limb[6]);
-  printf("y.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
-         "0x%llxULL, 0x%llxULL}\n",
+  printf("y.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
+         "0x%" PRIx64 "ULL}\n",
         g2.y.c1.limb[0], g2.y.c1.limb[1], g2.y.c1.limb[2], g2.y.c1.limb[3],
         g2.y.c1.limb[4], g2.y.c1.limb[5], g2.y.c1.limb[6]);
  printf("\n");
 }
+#endif

 // Test is_on_curve_g1 with point at infinity
 TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
-  uint64_t size_tracker = 0;
+
  G1Affine point;
  g1_point_at_infinity(point);

@@ -2520,7 +2544,7 @@ TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
 // We'll create a point by starting with a valid y and computing x
 // Or use a known valid point
 TEST_F(FpArithmeticTest, CurveG1ValidPoint) {
-  uint64_t size_tracker = 0;
+
  G1Affine point;
  point.infinity = false;

@@ -2593,7 +2617,7 @@ TEST_F(FpArithmeticTest, CurveG1ValidPoint) {

 // Test is_on_curve_g1 with invalid point
 TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {
-  uint64_t size_tracker = 0;
+
  G1Affine point;
  point.infinity = false;

@@ -2606,7 +2630,7 @@ TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {

 // Test that negating y preserves curve validity (on GPU)
 TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {
-  uint64_t size_tracker = 0;
+
  G1Affine point;
  point.infinity = false;

@@ -2650,7 +2674,7 @@ TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {

 // Test is_on_curve_g2 with point at infinity
 TEST_F(FpArithmeticTest, CurveG2PointAtInfinity) {
-  uint64_t size_tracker = 0;
+
  G2Affine point;
  g2_point_at_infinity(point);

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp2.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp2.cu
@@ -152,7 +152,7 @@ protected:

 // Test basic addition (on GPU)
 TEST_F(Fp2ArithmeticTest, Addition) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, b, c, c_cpu;

  // Test: (1 + 0*i) + (1 + 0*i) = (2 + 0*i)
@@ -175,7 +175,7 @@ TEST_F(Fp2ArithmeticTest, Addition) {

 // Test subtraction (on GPU)
 TEST_F(Fp2ArithmeticTest, Subtraction) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, b, c, a_cpu;

  // Test: (2 + 0*i) - (1 + 0*i) = (1 + 0*i)
@@ -197,7 +197,7 @@ TEST_F(Fp2ArithmeticTest, Subtraction) {

 // Test multiplication (on GPU)
 TEST_F(Fp2ArithmeticTest, Multiplication) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, b, result, expected, result_cpu;

  // Test: (1 + 1*i) * (1 + 1*i) = (0 + 2*i)
@@ -224,7 +224,7 @@ TEST_F(Fp2ArithmeticTest, Multiplication) {

 // Test i * i = -1 (on GPU)
 TEST_F(Fp2ArithmeticTest, I_Squared) {
-  uint64_t size_tracker = 0;
+
  Fp2 i_val, result, expected, result_cpu;

  // i = 0 + 1*i
@@ -250,7 +250,7 @@ TEST_F(Fp2ArithmeticTest, I_Squared) {

 // Test negation (on GPU)
 TEST_F(Fp2ArithmeticTest, Negation) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, neg_a, result, neg_a_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -273,7 +273,7 @@ TEST_F(Fp2ArithmeticTest, Negation) {

 // Test conjugation (on GPU)
 TEST_F(Fp2ArithmeticTest, Conjugation) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, conj, result, conj_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -304,7 +304,7 @@ TEST_F(Fp2ArithmeticTest, Conjugation) {

 // Test squaring (on GPU)
 TEST_F(Fp2ArithmeticTest, Squaring) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, square, square_cpu;

  // Test: (1 + 1*i)^2 = 2*i
@@ -327,7 +327,7 @@ TEST_F(Fp2ArithmeticTest, Squaring) {

 // Test zero and one (on GPU)
 TEST_F(Fp2ArithmeticTest, ZeroAndOne) {
-  uint64_t size_tracker = 0;
+
  Fp2 zero_val, one_val;

  fp2_zero(zero_val);
@@ -349,7 +349,7 @@ TEST_F(Fp2ArithmeticTest, ZeroAndOne) {

 // Test copy (on GPU)
 TEST_F(Fp2ArithmeticTest, Copy) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, b, b_cpu;

  a = test_utils_fp2::make_fp2_simple(42, 123);
@@ -370,7 +370,7 @@ TEST_F(Fp2ArithmeticTest, Copy) {

 // Test conditional move (on GPU)
 TEST_F(Fp2ArithmeticTest, ConditionalMove) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, b, result, result_cpu;

  a = test_utils_fp2::make_fp2_simple(10, 20);
@@ -411,7 +411,7 @@ TEST_F(Fp2ArithmeticTest, ConditionalMove) {

 // Test multiplication by zero (on GPU)
 TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, zero_val, result, result_cpu;

  fp2_zero(zero_val);
@@ -432,7 +432,7 @@ TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {

 // Test inversion (on GPU)
 TEST_F(Fp2ArithmeticTest, Inversion) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, a_inv, result, a_inv_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -456,7 +456,7 @@ TEST_F(Fp2ArithmeticTest, Inversion) {

 // Test division (on GPU)
 TEST_F(Fp2ArithmeticTest, Division) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, b, quotient, result, quotient_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(10, 6);
@@ -482,7 +482,7 @@ TEST_F(Fp2ArithmeticTest, Division) {

 // Test multiply by i (on GPU)
 TEST_F(Fp2ArithmeticTest, MultiplyByI) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, result, result_cpu;

  // Test: (a + b*i) * i = -b + a*i
@@ -509,7 +509,7 @@ TEST_F(Fp2ArithmeticTest, MultiplyByI) {

 // Test Frobenius map (on GPU)
 TEST_F(Fp2ArithmeticTest, Frobenius) {
-  uint64_t size_tracker = 0;
+
  Fp2 a, frob, conj, frob_cpu, conj_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -541,7 +541,7 @@ TEST_F(Fp2ArithmeticTest, Frobenius) {

 // Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
 TEST_F(Fp2PropertyTest, AdditionAssociativity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -567,7 +567,7 @@ TEST_F(Fp2PropertyTest, AdditionAssociativity) {

 // Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -593,7 +593,7 @@ TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {

 // Test distributivity: a * (b + c) = a*b + a*c (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -620,7 +620,7 @@ TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {

 // Test addition commutativity (on GPU)
 TEST_F(Fp2PropertyTest, AdditionCommutativity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -638,7 +638,7 @@ TEST_F(Fp2PropertyTest, AdditionCommutativity) {

 // Test multiplication commutativity (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -656,7 +656,7 @@ TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {

 // Test additive identity: a + 0 = a (on GPU)
 TEST_F(Fp2PropertyTest, AdditiveIdentity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 result;
@@ -672,7 +672,7 @@ TEST_F(Fp2PropertyTest, AdditiveIdentity) {

 // Test multiplicative identity: a * 1 = a (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 result;
@@ -688,7 +688,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {

 // Test additive inverse: a + (-a) = 0 (on GPU)
 TEST_F(Fp2PropertyTest, AdditiveInverse) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 neg_a, result;
@@ -705,7 +705,7 @@ TEST_F(Fp2PropertyTest, AdditiveInverse) {

 // Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicativeInverse) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    // Skip zero
@@ -726,7 +726,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeInverse) {

 // Test square vs multiply by self: a^2 = a * a (on GPU)
 TEST_F(Fp2PropertyTest, SquareVsMultiply) {
-  uint64_t size_tracker = 0;
+
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();

@@ -747,7 +747,7 @@ TEST_F(Fp2PropertyTest, SquareVsMultiply) {

 // Test CUDA kernel: array addition
 TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {
-  uint64_t size_tracker = 0;
+
  const int n = 1000;
  Fp2 *h_a = new Fp2[n];
  Fp2 *h_b = new Fp2[n];
@@ -784,7 +784,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {

 // Test CUDA kernel: array multiplication
 TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {
-  uint64_t size_tracker = 0;
+
  const int n = 1000;
  Fp2 *h_a = new Fp2[n];
  Fp2 *h_b = new Fp2[n];
@@ -825,7 +825,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {

 // Test is_on_curve_g2 with point at infinity
 TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {
-  uint64_t size_tracker = 0;
+
  G2Affine point;
  g2_point_at_infinity(point);

@@ -835,7 +835,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {

 // Test is_on_curve_g2 with valid point construction
 TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {
-  uint64_t size_tracker = 0;
+
  G2Affine point;
  point.infinity = false;

@@ -860,7 +860,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {

 // Test that field operations maintain curve validity for G2
 TEST_F(Fp2ArithmeticTest, CurveG2FieldOperationsConsistency) {
-  uint64_t size_tracker = 0;
+
  // Create a point (we'll test the consistency check works)
  G2Affine point;
  point.infinity = false;
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_msm.cu
@@ -24,8 +24,7 @@ static void test_point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
  auto *d_scratch =
      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
          scratch_bytes, stream, gpu_index, size_tracker, true));
-  point_msm_g1(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch,
-               size_tracker, true);
+  point_msm_g1(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch);
  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
 }

@@ -37,8 +36,7 @@ static void test_point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
  auto *d_scratch =
      static_cast<G2Projective *>(cuda_malloc_with_size_tracking_async(
          scratch_bytes, stream, gpu_index, size_tracker, true));
-  point_msm_g2(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch,
-               size_tracker, true);
+  point_msm_g2(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch);
  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
 }

--- a/backends/zk-cuda-backend/src/bindings.rs
+++ b/backends/zk-cuda-backend/src/bindings.rs
@@ -147,8 +147,6 @@ unsafe extern "C" {
        d_scalars: *const Scalar,
        n: u32,
        d_scratch: *mut G1ProjectivePoint,
-        gpu_memory_allocated: bool,
-        size_tracker: *mut u64,
    );
 }
 unsafe extern "C" {
@@ -160,8 +158,6 @@ unsafe extern "C" {
        d_scalars: *const Scalar,
        n: u32,
        d_scratch: *mut G2ProjectivePoint,
-        gpu_memory_allocated: bool,
-        size_tracker: *mut u64,
    );
 }
 unsafe extern "C" {
--- a/backends/zk-cuda-backend/src/c_wrapper.cu
+++ b/backends/zk-cuda-backend/src/c_wrapper.cu
@@ -11,6 +11,8 @@
 #include <stddef.h>
 #include <cstring>

+#include "../../tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh"
+
 // C++ helper functions (not exported, used internally)
 // These can call template functions since they have C++ linkage
 static void convert_g1_points_to_montgomery(cudaStream_t stream, uint32_t gpu_index, G1Affine* d_points, uint32_t n) {
@@ -64,8 +66,7 @@ bool g2_is_infinity_wrapper(const G2Affine* point) {
 }

 // Unmanaged MSM wrapper for G1 (points/scalars/scratch on device, result on host)
-// Points MUST be in Montgomery form. Caller provides scratch buffer and
-// controls allocation tracking via gpu_memory_allocated.
+// Points MUST be in Montgomery form. Caller provides scratch buffer.
 // Zero internal allocations — this is a thin validation + dispatch layer.
 void g1_msm_unmanaged_wrapper_async(
    cudaStream_t stream,
@@ -74,12 +75,9 @@ void g1_msm_unmanaged_wrapper_async(
    const G1Affine* d_points,
    const Scalar* d_scalars,
    uint32_t n,
-    G1Projective* d_scratch,
-    bool gpu_memory_allocated,
-    uint64_t* size_tracker
+    G1Projective* d_scratch
 ) {
-    PANIC_IF_FALSE(size_tracker != nullptr, "G1 MSM error: size_tracker is null");
-    uint64_t& size_tracker_ref = *size_tracker;
+    PUSH_RANGE("G1 MSM UNMANAGED");
    PANIC_IF_FALSE(n > 0, "G1 MSM error: n must be positive, got %u", n);
    PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
    PANIC_IF_FALSE(h_result != nullptr, "G1 MSM error: h_result is null");
@@ -91,13 +89,13 @@ void g1_msm_unmanaged_wrapper_async(
                   cuda_get_number_of_gpus());

    point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, gpu_memory_allocated);
+                       d_scratch);
    check_cuda_error(cudaGetLastError());
+    POP_RANGE();
 }

 // Unmanaged MSM wrapper for G2 (points/scalars/scratch on device, result on host)
-// Points MUST be in Montgomery form. Caller provides scratch buffer and
-// controls allocation tracking via gpu_memory_allocated.
+// Points MUST be in Montgomery form. Caller provides scratch buffer.
 // Zero internal allocations — this is a thin validation + dispatch layer.
 void g2_msm_unmanaged_wrapper_async(
    cudaStream_t stream,
@@ -106,12 +104,9 @@ void g2_msm_unmanaged_wrapper_async(
    const G2Affine* d_points,
    const Scalar* d_scalars,
    uint32_t n,
-    G2Projective* d_scratch,
-    bool gpu_memory_allocated,
-    uint64_t* size_tracker
+    G2Projective* d_scratch
 ) {
-    PANIC_IF_FALSE(size_tracker != nullptr, "G2 MSM error: size_tracker is null");
-    uint64_t& size_tracker_ref = *size_tracker;
+    PUSH_RANGE("G2 MSM UNMANAGED");
    PANIC_IF_FALSE(n > 0, "G2 MSM error: n must be positive, got %u", n);
    PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
    PANIC_IF_FALSE(h_result != nullptr, "G2 MSM error: h_result is null");
@@ -123,8 +118,9 @@ void g2_msm_unmanaged_wrapper_async(
                   cuda_get_number_of_gpus());

    point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, gpu_memory_allocated);
+                       d_scratch);
    check_cuda_error(cudaGetLastError());
+    POP_RANGE();
 }

 // Scratch size query wrappers (needed for bindgen `.*_wrapper` allowlist)
@@ -148,7 +144,8 @@ void g1_msm_managed_wrapper(
    bool points_in_montgomery,
    uint64_t* size_tracker
 ) {
-    uint64_t& size_tracker_ref = *size_tracker;
+    PUSH_RANGE("G1 MSM MANAGED");
+    uint64_t& size_tracker_local = *size_tracker;
    PANIC_IF_FALSE(n > 0, "G1 MSM error: n must be positive, got %u", n);
    PANIC_IF_FALSE(result != nullptr, "G1 MSM error: result is null");
    PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
@@ -165,8 +162,8 @@ void g1_msm_managed_wrapper(
    size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));

    // TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
-    auto* d_points = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
-    auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
+    auto* d_points = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_local, true));
+    auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_local, true));

    // Always copy points to GPU first
    cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
@@ -181,14 +178,14 @@ void g1_msm_managed_wrapper(
    // Allocate scratch buffer sized to match the pippenger internal partitioning
    size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
    auto* d_scratch = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(
-        scratch_bytes, stream, gpu_index, size_tracker_ref, true));
+        scratch_bytes, stream, gpu_index, size_tracker_local, true));

    PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
                   "G1 MSM error: device memory allocation failed");

    // Result written directly to host pointer -- no device round-trip needed
    point_msm_g1_async(stream, gpu_index, result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, true);
+                       d_scratch);
    check_cuda_error(cudaGetLastError());

    cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
@@ -197,6 +194,7 @@ void g1_msm_managed_wrapper(

    // Sync for the async frees above.
    cuda_synchronize_stream(stream, gpu_index);
+    POP_RANGE();
 }


@@ -212,7 +210,8 @@ void g2_msm_managed_wrapper(
    bool points_in_montgomery,
    uint64_t* size_tracker
 ) {
-    uint64_t& size_tracker_ref = *size_tracker;
+    PUSH_RANGE("G2 MSM MANAGED");
+    uint64_t& size_tracker_local = *size_tracker;
    PANIC_IF_FALSE(n > 0, "G2 MSM error: n must be positive, got %u", n);
    PANIC_IF_FALSE(result != nullptr, "G2 MSM error: result is null");
    PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
@@ -229,8 +228,8 @@ void g2_msm_managed_wrapper(
    size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));

    // TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
-    auto* d_points = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
-    auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
+    auto* d_points = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_local, true));
+    auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_local, true));

    cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
    cuda_memcpy_with_size_tracking_async_to_gpu(d_scalars, scalars, scalars_bytes, stream, gpu_index, true);
@@ -243,14 +242,14 @@ void g2_msm_managed_wrapper(
    // Allocate scratch buffer sized to match the pippenger internal partitioning
    size_t scratch_bytes = pippenger_scratch_size_g2(n, gpu_index);
    auto* d_scratch = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(
-        scratch_bytes, stream, gpu_index, size_tracker_ref, true));
+        scratch_bytes, stream, gpu_index, size_tracker_local, true));

    PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
                   "G2 MSM error: device memory allocation failed");

    // Result written directly to host pointer -- no device round-trip needed
    point_msm_g2_async(stream, gpu_index, result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, true);
+                       d_scratch);
    check_cuda_error(cudaGetLastError());

    cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
@@ -259,6 +258,7 @@ void g2_msm_managed_wrapper(

    // Sync for the async frees above.
    cuda_synchronize_stream(stream, gpu_index);
+    POP_RANGE();
 }

 void g1_from_montgomery_wrapper(G1Affine* result, const G1Affine* point) {
--- a/backends/zk-cuda-backend/src/include/api.h
+++ b/backends/zk-cuda-backend/src/include/api.h
@@ -109,9 +109,7 @@ void g1_msm_unmanaged_wrapper_async(
    const G1Point* d_points,
    const Scalar* d_scalars,
    uint32_t n,
-    G1ProjectivePoint* d_scratch,
-    bool gpu_memory_allocated,
-    uint64_t* size_tracker
+    G1ProjectivePoint* d_scratch
 );

 void g2_msm_unmanaged_wrapper_async(
@@ -121,9 +119,7 @@ void g2_msm_unmanaged_wrapper_async(
    const G2Point* d_points,
    const Scalar* d_scalars,
    uint32_t n,
-    G2ProjectivePoint* d_scratch,
-    bool gpu_memory_allocated,
-    uint64_t* size_tracker
+    G2ProjectivePoint* d_scratch
 );

 // Scratch size queries for Pippenger MSM
--- a/backends/zk-cuda-backend/src/types/g1.rs
+++ b/backends/zk-cuda-backend/src/types/g1.rs
@@ -234,8 +234,7 @@ impl G1Projective {
    ///
    /// The caller is responsible for creating and destroying the stream.
    ///
-    /// Returns the result and the size_tracker (GPU memory allocated in bytes) if successful,
-    /// or an error if MSM computation fails.
+    /// Returns the MSM result if successful, or an error if MSM computation fails.
    #[allow(clippy::not_unsafe_ptr_arg_deref)]
    #[must_use = "GPU MSM result must be handled"]
    pub fn msm(
@@ -244,14 +243,14 @@ impl G1Projective {
        stream: *mut std::ffi::c_void,
        gpu_index: u32,
        points_in_montgomery: bool,
-    ) -> Result<(Self, u64), String> {
+    ) -> Result<Self, String> {
        assert_eq!(
            points.len(),
            scalars.len(),
            "GPU MSM: points and scalars must have the same length"
        );
        if points.is_empty() {
-            return Ok((Self::infinity(), 0));
+            return Ok(Self::infinity());
        }
        if stream.is_null() {
            return Err("GPU MSM: stream pointer is null".to_string());
@@ -263,7 +262,6 @@ impl G1Projective {
        let points_ffi: Vec<G1Point> = points.iter().map(|p| p.inner).collect();
        let scalars_ffi: Vec<ScalarFFI> = scalars.iter().map(|s| *s.inner()).collect();
        let mut result = G1ProjectivePoint::default();
-        let mut size_tracker: u64 = 0;
        // NOTE: This method uses the managed API (g1_msm_managed_wrapper) which handles
        // memory allocation and transfers internally. For a pure-GPU verify/proof implementation
        // where all data is already on the device and memory is managed externally, use the
@@ -279,12 +277,13 @@ impl G1Projective {
        //   ownership. The caller remains responsible for destroying the stream after use.
        // - `gpu_index` is passed directly to CUDA; the C++ wrapper validates it
        // - `points_ffi` and `scalars_ffi` are valid Vec slices with matching length `n`
-        // - `result` and `size_tracker` are valid stack-allocated outputs
+        // - `result` is a valid stack-allocated output
        // - The managed wrapper handles all device memory allocation/deallocation internally
        // - Failure: The C++ managed wrapper validates all inputs via PANIC_IF_FALSE and checks
        //   CUDA errors via cudaGetLastError() after each kernel launch.
        // - Success: The C++ managed wrapper calls cuda_synchronize_stream before returning,
        //   ensuring `result` contains the final MSM output.
+        let mut size_tracker: u64 = 0;
        unsafe {
            crate::bindings::g1_msm_managed_wrapper(
                stream as crate::bindings::cudaStream_t,
@@ -298,7 +297,7 @@ impl G1Projective {
            );
        }

-        Ok((Self { inner: result }, size_tracker))
+        Ok(Self { inner: result })
    }
 }

--- a/backends/zk-cuda-backend/src/types/g2.rs
+++ b/backends/zk-cuda-backend/src/types/g2.rs
@@ -241,8 +241,7 @@ impl G2Projective {
    ///
    /// The caller is responsible for creating and destroying the stream.
    ///
-    /// Returns the result and the size_tracker (GPU memory allocated in bytes) if successful,
-    /// or an error if MSM computation fails.
+    /// Returns the MSM result if successful, or an error if MSM computation fails.
    #[allow(clippy::not_unsafe_ptr_arg_deref)]
    #[must_use = "GPU MSM result must be handled"]
    pub fn msm(
@@ -251,14 +250,14 @@ impl G2Projective {
        stream: *mut std::ffi::c_void,
        gpu_index: u32,
        points_in_montgomery: bool,
-    ) -> Result<(Self, u64), String> {
+    ) -> Result<Self, String> {
        assert_eq!(
            points.len(),
            scalars.len(),
            "GPU MSM: points and scalars must have the same length"
        );
        if points.is_empty() {
-            return Ok((Self::infinity(), 0));
+            return Ok(Self::infinity());
        }
        if stream.is_null() {
            return Err("GPU MSM: stream pointer is null".to_string());
@@ -270,8 +269,6 @@ impl G2Projective {
        let points_ffi: Vec<G2Point> = points.iter().map(|p| p.inner).collect();
        let scalars_ffi: Vec<ScalarFFI> = scalars.iter().map(|s| *s.inner()).collect();
        let mut result = G2ProjectivePoint::default();
-
-        let mut size_tracker: u64 = 0;
        // NOTE: This method uses the managed API (g2_msm_managed_wrapper) which handles
        // memory allocation and transfers internally. For a pure-GPU verify/proof implementation
        // where all data is already on the device and memory is managed externally, use the
@@ -287,12 +284,13 @@ impl G2Projective {
        //   ownership. The caller remains responsible for destroying the stream after use.
        // - `gpu_index` is passed directly to CUDA; the C++ wrapper validates it
        // - `points_ffi` and `scalars_ffi` are valid Vec slices with matching length `n`
-        // - `result` and `size_tracker` are valid stack-allocated outputs
+        // - `result` is a valid stack-allocated output
        // - The managed wrapper handles all device memory allocation/deallocation internally
        // - Failure: The C++ managed wrapper validates all inputs via PANIC_IF_FALSE and checks
        //   CUDA errors via cudaGetLastError() after each kernel launch.
        // - Success: The C++ managed wrapper calls cuda_synchronize_stream before returning,
        //   ensuring `result` contains the final MSM output.
+        let mut size_tracker: u64 = 0;
        unsafe {
            crate::bindings::g2_msm_managed_wrapper(
                stream as crate::bindings::cudaStream_t,
@@ -306,7 +304,7 @@ impl G2Projective {
            );
        }

-        Ok((Self { inner: result }, size_tracker))
+        Ok(Self { inner: result })
    }
 }

--- a/backends/zk-cuda-backend/src/types/mod.rs
+++ b/backends/zk-cuda-backend/src/types/mod.rs
@@ -178,13 +178,13 @@ mod tests {

    #[test]
    fn g1_msm_empty_returns_infinity() {
-        let (result, _) = G1Projective::msm(&[], &[], std::ptr::null_mut(), 0, false).unwrap();
+        let result = G1Projective::msm(&[], &[], std::ptr::null_mut(), 0, false).unwrap();
        assert!(result.to_affine().is_infinity());
    }

    #[test]
    fn g2_msm_empty_returns_infinity() {
-        let (result, _) = G2Projective::msm(&[], &[], std::ptr::null_mut(), 0, false).unwrap();
+        let result = G2Projective::msm(&[], &[], std::ptr::null_mut(), 0, false).unwrap();
        assert!(result.to_affine().is_infinity());
    }

@@ -194,7 +194,7 @@ mod tests {
        let one = Scalar::from_u64(1);

        let stream = unsafe { tfhe_cuda_backend::cuda_bind::cuda_create_stream(0) };
-        let (result, _) = G1Projective::msm(&[gen], &[one], stream, 0, false).unwrap();
+        let result = G1Projective::msm(&[gen], &[one], stream, 0, false).unwrap();
        unsafe { tfhe_cuda_backend::cuda_bind::cuda_destroy_stream(stream, 0) };

        // from_montgomery_normalized() normalizes (divides by Z in Montgomery form)
@@ -219,7 +219,7 @@ mod tests {
        let one = Scalar::from_u64(1);

        let stream = unsafe { tfhe_cuda_backend::cuda_bind::cuda_create_stream(0) };
-        let (result, _) = G2Projective::msm(&[gen], &[one], stream, 0, false).unwrap();
+        let result = G2Projective::msm(&[gen], &[one], stream, 0, false).unwrap();
        unsafe { tfhe_cuda_backend::cuda_bind::cuda_destroy_stream(stream, 0) };

        // Same approach: extract affine coordinates directly from normalized projective
--- a/ci/data_extractor/src/benchmark_specs.py
+++ b/ci/data_extractor/src/benchmark_specs.py
@@ -302,7 +302,7 @@ class BenchType(enum.StrEnum):

 class BenchSubset(enum.StrEnum):
    All = "all"
-    Erc20 = "erc20"
+    Erc7984 = "erc7984"
    Zk = "zk"

    @staticmethod
@@ -310,8 +310,8 @@ class BenchSubset(enum.StrEnum):
        match bench_subset.lower():
            case "all":
                return BenchSubset.All
-            case "erc20":
-                return BenchSubset.Erc20
+            case "erc7984":
+                return BenchSubset.Erc7984
            case "zk":
                return BenchSubset.Zk
            case _:
@@ -611,14 +611,14 @@ class BenchDetails:
                        # Case for arithmetic operations (add, sub, mul,...)
                        self.operation_name = "::".join(parts[2:-2])
                    else:
-                        # Case for higher-level operation (erc20 transfer, dex,...)
+                        # Case for higher-level operation (erc7984 transfer, dex,...)
                        self.operation_name = "::".join(parts[2:-1])
                else:
                    if "_PARAM_" in parts[-2]:
                        # Case for arithmetic operations (add, sub, mul,...)
                        self.operation_name = "::".join(parts[1:-2])
                    else:
-                        # Case for higher-level operation (erc20 transfer, dex,...)
+                        # Case for higher-level operation (erc7984 transfer, dex,...)
                        self.operation_name = "::".join(parts[1:-1])
                self.rust_type = parts[-1].partition("_mean")[0]
            case Layer.Shortint:
--- a/ci/data_extractor/src/data_extractor.py
+++ b/ci/data_extractor/src/data_extractor.py
@@ -137,7 +137,7 @@ parser.add_argument(
 parser.add_argument(
    "--bench-subset",
    dest="bench_subset",
-    choices=["all", "erc20", "zk"],
+    choices=["all", "erc7984", "zk"],
    default="all",
    help="Subset of benchmarks to filter against, dedicated formatting will be applied",
 )
@@ -285,8 +285,8 @@ def perform_hardware_comparison(

 def get_formatter(layer: Layer, bench_subset: BenchSubset):
    match bench_subset:
-        case BenchSubset.Erc20:
-            return formatters.hlapi.Erc20Formatter
+        case BenchSubset.Erc7984:
+            return formatters.hlapi.Erc7984Formatter
        case BenchSubset.Zk:
            if layer == Layer.Wasm:
                return formatters.wasm.ZKFormatter
@@ -442,7 +442,7 @@ def get_operands_types(layer: Layer, bench_subset: BenchSubset = None):
        return ciphertext_only
    elif bench_subset:
        match bench_subset:
-            case BenchSubset.Zk | BenchSubset.Erc20:
+            case BenchSubset.Zk | BenchSubset.Erc7984:
                return ciphertext_only
            case BenchSubset.All:
                return ciphertext_and_plaintext
--- a/ci/data_extractor/src/formatters/hlapi/hlapi.py
+++ b/ci/data_extractor/src/formatters/hlapi/hlapi.py
@@ -39,9 +39,9 @@ class HlApiFormatter(GenericFormatter):
 TRANSFER_IMPLEM_COLUMN_HEADER = "Transfer implementation"


-class Erc20Formatter(HlApiFormatter):
+class Erc7984Formatter(HlApiFormatter):
    """
-    Formatter for ERC20 benchmarks.
+    Formatter for ERC7984 benchmarks.
    """

    @staticmethod
@@ -63,7 +63,7 @@ class Erc20Formatter(HlApiFormatter):
                bench_type = BenchType.Latency
                conversion_func = utils.convert_latency_value_to_readable_text

-            # For now ERC20 benchmarks are only made on 64-bit ciphertexts.
+            # For now ERC7984 benchmarks are only made on 64-bit ciphertexts.
            value = conversion_func(timings[-1])
            formatted[test_name][bench_type] = value

--- a/ci/regression.toml
+++ b/ci/regression.toml
@@ -66,7 +66,7 @@ target.hlapi-dex = [
    "swap_claim::whitepaper",
    "swap_claim::no_cmux"
 ]
-target.hlapi-erc20 = ["transfer::whitepaper", "transfer::no_cmux"]
+target.hlapi-erc7984 = ["transfer::whitepaper", "transfer::no_cmux"]
 target.core_crypto-ks = ["keyswitch"]
 target.core_crypto-pbs = ["multi_bit_pbs"]
 parameters_filter = "PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128"
@@ -99,7 +99,7 @@ target.hlapi-dex = [
    "swap_claim::whitepaper",
    "swap_claim::no_cmux"
 ]
-target.hlapi-erc20 = ["transfer::whitepaper", "transfer::no_cmux"]
+target.hlapi-erc7984 = ["transfer::whitepaper", "transfer::no_cmux"]
 target.shortint = ["bitand"]
 target.core_crypto-ks = ["keyswitch"]
 target.core_crypto-pbs = [ "pbs_mem_optimized"]
--- a/ci/webdriver.py
+++ b/ci/webdriver.py
@@ -85,6 +85,11 @@ parser.add_argument(
    dest="value_filter_pattern",
    help="Pattern to use to filter HTML button value displayed on web page",
 )
+parser.add_argument(
+    "--id-exclude-pattern",
+    dest="id_exclude_pattern",
+    help="Pattern to use to exclude HTML button IDs from the filtered set",
+)
 parser.add_argument(
    "-f",
    "--fail-fast",
@@ -142,11 +147,14 @@ class Driver:
                        "Script is running as root, running browser with --no-sandbox for compatibility"
                    )
                self.options.add_argument("--no-sandbox")
+                self.options.add_argument("--headless=new")
+                # Needed for wasm-par-mq sync executor mode
+                self.options.add_argument("--enable-features=ServiceWorker")
            case BrowserKind.firefox:
                self.options = FirefoxOptions()
+                self.options.add_argument("-headless")

        self.options.binary_location = self.browser_path
-        self.options.add_argument("--headless")

        self._driver = None

@@ -279,7 +287,14 @@ class Cases:
        self._cases.append(use_case)

    def _filter(self, field, pattern):
-        return [case for case in self._cases if pattern in getattr(case, field)]
+        result = Cases()
+        result._cases = [case for case in self._cases if pattern in getattr(case, field)]
+        return result
+
+    def _exclude(self, field, pattern):
+        result = Cases()
+        result._cases = [case for case in self._cases if pattern not in getattr(case, field)]
+        return result

    def filter_by_id(self, pattern):
        """
@@ -287,7 +302,7 @@ class Cases:

        :param pattern: :class:`str` that would be included in `id`

-        :return: :class:`list` comprehension of :class:`UseCase`
+        :return: :class:`Cases` containing matching use cases
        """
        return self._filter("id", pattern)

@@ -297,10 +312,20 @@ class Cases:

        :param pattern: :class:`str` that would be included in `value`

-        :return: :class:`list` comprehension of :class:`UseCase`
+        :return: :class:`Cases` containing matching use cases
        """
        return self._filter("value", pattern)

+    def exclude_by_id(self, pattern):
+        """
+        Exclude use cases whose HTML `id` attribute contains the pattern.
+
+        :param pattern: :class:`str` that would be excluded from `id`
+
+        :return: :class:`Cases` with matching use cases removed
+        """
+        return self._exclude("id", pattern)
+

 def parse_html_index(filepath):
    """
@@ -471,6 +496,8 @@ def main():
        cases = cases.filter_by_id(args.id_filter_pattern)
    elif args.value_filter_pattern:
        cases = cases.filter_by_value(args.value_filter_pattern)
+    if args.id_exclude_pattern:
+        cases = cases.exclude_by_id(args.id_exclude_pattern)

    server_process = None
    if args.server_cmd:
--- a/mockups/tfhe-hpu-mockup/LICENSE
+++ b/mockups/tfhe-hpu-mockup/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/mockups/tfhe-hpu-mockup/README.md
+++ b/mockups/tfhe-hpu-mockup/README.md
@@ -164,7 +164,7 @@ cargo run --release --features="hpu" --test hpu -- u8
 # * bitwise: for ct x ct bitwise IOps
 # * cmp: for comparison IOps
 # * ternary: for if-then-else and like IOps
-# * algo: for IOps dedicated to offload sub-algorithm like ERC_20
+# * algo: for IOps dedicated to offload sub-algorithm like ERC_7984
 # Command below only run comparison IOps, for convenience, `just hpu_test "cmp"` could be also used
 cargo run --release --features="hpu" --test hpu -- cmp
 ```
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
J-B Orfila	23c9bc8f92	chores(docs): minor corrections to the doc	2026-04-23 10:46:35 +02:00
Nicolas Sarlin	4790f8ba1c	fix(bench): wrong size in wasm benchmarks	2026-04-14 11:17:11 +02:00
dependabot[bot]	79a54df25b	chore(deps): bump docker/login-action from 4.0.0 to 4.1.0 Bumps [docker/login-action](https://github.com/docker/login-action) from 4.0.0 to 4.1.0. - [Release notes](https://github.com/docker/login-action/releases) - [Commits](`b45d80f862...4907a6ddec`) --- updated-dependencies: - dependency-name: docker/login-action dependency-version: 4.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-14 10:32:05 +02:00
Theo Souchon	50d6be121a	chore(test): refacto around noise check test and json output	2026-04-14 09:58:20 +02:00
Nicolas Sarlin	7cd966d8a7	chore: allow rand audit advisory	2026-04-14 08:54:05 +02:00
David Testé	6ca929051d	chore(ci): remove permanent instance fallback for gpu These fallback were set to mitigate Hyperstack resource shortages. Those instances are not used anymore and workflows are modified to avoid having a workflow run stuck because it waits for a permanent runner that doesn't exist.	2026-04-10 14:30:32 +02:00
Pedro Alves	871cc8f772	chore(docs): rewrite GPU ZK-PoK documentation for zk-cuda-backend integration	2026-04-10 08:40:08 -03:00
Theo Souchon	b938473788	chore: renamed erc20 to erc7984	2026-04-10 09:18:51 +02:00
Nicolas Sarlin	74869f5e2f	chore(integer): refactor expansion helper - Split the pure expand and the post processing (cast, unpack, sanitize) - Add a new internal intermediate type: ExpandedCiphertextList - verify_and_expand just calls verify+expand	2026-04-09 11:07:03 +02:00
dependabot[bot]	326dd6a5c7	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 5.0.1 to 5.0.4. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`70c4af2ed5...ca46236c6c`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-version: 5.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-08 09:42:31 +02:00
Guillermo Oyarzun	1abc69751a	feat(gpu): create noise and pfail tests for rerand	2026-04-07 20:33:31 +02:00
Pedro Alves	3c2cb273d5	chore(docs): add GPU ZK benchmark SVG generation to CI pipeline Add GPU ZK benchmark and SVG generation jobs to the documentation workflows, and fix the data extractor to handle the cuda::zk:: prefix chain in GPU ZK benchmark names.	2026-04-07 05:02:04 -03:00
dependabot[bot]	b18060e5c8	chore(deps): bump codecov/codecov-action from 5.5.2 to 6.0.0 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.5.2 to 6.0.0. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](`671740ac38...57e3a136b7`) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-version: 6.0.0 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-07 09:51:30 +02:00
dependabot[bot]	c8827a21a7	chore(deps): bump rust-lang/crates-io-auth-action from 1.0.3 to 1.0.4 Bumps [rust-lang/crates-io-auth-action](https://github.com/rust-lang/crates-io-auth-action) from 1.0.3 to 1.0.4. - [Release notes](https://github.com/rust-lang/crates-io-auth-action/releases) - [Commits](`b7e9a28ede...bbd81622f2`) --- updated-dependencies: - dependency-name: rust-lang/crates-io-auth-action dependency-version: 1.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-07 09:51:08 +02:00
Andrei Stoian	a7476d0aaa	chore(gpu): update benchmarks GPU fix(gpu): pbs benches fix(gpu): pbs benches	2026-04-03 11:29:51 +02:00
David Testé	10d104e500	chore: update copyright year to 2026	2026-04-03 10:22:13 +02:00
David Testé	dbb1f151c8	chore(ci): add release workflow for zk-cuda-backend	2026-04-01 11:14:31 +02:00
David Testé	9cb8ad9bff	chore(ci): create common cuda release workflow This refactorting is done to add zk-cuda-backend crate release without duplicating the logic in the new workflow.	2026-04-01 11:14:31 +02:00
David Testé	d970210ae4	chore(ci): update slab-github-runner action to v1.6.0 This action version now uses node24 as runner since node20 support is dropped on April 2026.	2026-04-01 09:47:44 +02:00
David Palm	5236c21733	chore: Move safe-serialization to own crate and wire it up with the workspace.	2026-03-31 16:30:40 +02:00
Guillermo Oyarzun	7598725c7e	feat(gpu): add pbs128 pattern to multi-bit noise test	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	f0cff6176d	feat(gpu): add cpk ks ms pattern to multi-bit noise tests	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	8bb38d4e70	feat(gpu): add packing ks multi-bit noise tests	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	35fe71cc07	feat(gpu): add br_dp_ks_ms pattern to multi-bit noise tests	2026-03-31 14:30:01 +02:00
Nicolas Sarlin	62429da859	chore(ci): publish tfhe-compat js package	2026-03-31 13:40:08 +02:00
dependabot[bot]	8a4b3c35f4	chore(deps): bump actions/cache from 5.0.3 to 5.0.4 Bumps [actions/cache](https://github.com/actions/cache) from 5.0.3 to 5.0.4. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](`cdf6c1fa76...668228422a`) --- updated-dependencies: - dependency-name: actions/cache dependency-version: 5.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-03-30 14:31:27 +02:00
Theo Souchon	641fec028f	chore(lint): add message for the backward compat report when everything is ok	2026-03-30 13:25:58 +02:00
Nicolas Sarlin	8d8379409b	chore(shortint): remove 'parallel-wasm-api' feature gating This used to be required but now rayon handles this gracefully and runs the code sequentially when threads are not available	2026-03-30 11:58:52 +02:00
Nicolas Sarlin	d547e67f66	refactor(hl): factorize hl proven ct list expand code	2026-03-30 11:58:40 +02:00
Arthur Meyre	4cf03c063d	chore: update Cargo.locks for generate crates	2026-03-30 11:02:17 +02:00
Thomas Montaigu	9372c761dd	chore: only check typos on tracked files I have a bunch of files in the repo that are not tracked and not meant to be. the typos command checks them this prevents me from running fpcc/pcc locally Advantages: check all tracked files (it seems like the .github is ignored) Drawback: filtering is part of the git ls-files command not the _typos.toml	2026-03-27 12:15:14 +01:00
Thomas Montaigu	d9dec879e7	chore: only check_tfhe_docs_are_tested on trackedfiles Checking more than the .md tracked is excessive Also change the print of the missing files to be more human readable	2026-03-27 12:12:53 +01:00
Thomas Montaigu	6a0fb21fd0	chore: make check_fmt_toml exit with non-zero code The check_fmt_toml recipe was just doing an echo with an help message on taplo error, but this also 'ate' the non zero exit code which chains of recipe (make pcc, fpcc) and ci would not stop on the error, makint this check not very useful	2026-03-27 11:08:41 +01:00
Andrei Stoian	95058c9b00	fix(gpu): compile	2026-03-27 10:43:28 +01:00
Andrei Stoian	e19c5826c0	fix(gpu): compile	2026-03-27 10:43:28 +01:00
Andrei Stoian	adf27ab700	fix(gpu): more conformance fixes	2026-03-27 10:43:28 +01:00
Pedro Alves	32c6db381f	fix(gpu): adapt benchmarks to benchmark_spec API and link helper_profile in CUDA tests	2026-03-26 15:37:02 -03:00
Pedro Alves	d79801b340	fix(gpu): bump tfhe-cuda-backend dependency to 0.14.0, remove zkv1 GPU code path, and clean up zk-cuda-backend API - Bump tfhe-cuda-backend dependency from 0.13.0 to 0.14.0 - Remove deprecated zkv1 GPU code path (prove now unconditionally uses CPU) - Remove `Option` wrapper from `gpu_index` parameter — callers always pass a concrete `u32`, so the indirection added no value - Fix compilation warnings in zk-cuda-backend	2026-03-26 15:37:02 -03:00
Pedro Alves	0a4e4cf9e2	feat(gpu): add PTX carry-chain CIOS Montgomery multiply for Fp, and add PTX carry chains for fp_add/sub and branchless reduction -replace software carry detection (carry = (sum < old) ? 1 : 0) with inline PTX hardware carry flags (add.cc.u64/addc.u64) - replace software carry detection in fp_add_raw/fp_sub_raw with inline PTX add.cc.u64/addc.cc.u64 and sub.cc.u64/subc.cc.u64 chains\ - now we always compute both reduced and unreduced result and select via bitmask	2026-03-26 15:37:02 -03:00
Pedro Alves	5d6b3146b1	feat(gpu): integrate zk-cuda-backend with tfhe-zk-pok	2026-03-26 15:37:02 -03:00
Theo Souchon	e4ea44c571	chore(lint): add crate context for backward compat error message	2026-03-26 15:46:21 +01:00
Andrei Stoian	de331f322a	fix(gpu): allow minor and patch differences in cuda backend	2026-03-26 15:01:56 +01:00
Nicolas Sarlin	844c345e18	fix(zk): wrong hash of new types in snapshot	2026-03-26 14:30:22 +01:00
Nicolas Sarlin	a9520f8930	docs(wasm): document cross origin mode for zk proofs	2026-03-26 13:23:29 +01:00
Nicolas Sarlin	98a9baf7a8	perf(zk): use wnaf for g2 msm in wasm	2026-03-26 13:23:29 +01:00
Nicolas Sarlin	c621c1fc77	perf(zk): use wasm-par-mq to compute msm in proof	2026-03-26 13:23:29 +01:00
Nicolas Sarlin	41fffb0306	chore: rename unsafe_coop to cross_origin	2026-03-26 13:23:29 +01:00