WIP: run gpu documentation benchmarks on scaleway

chore(ci): add terraform script for gpu benchmarks
This would spawn an H100-SXM-8-80G instance on the Scaleway platform.
2026-04-28 03:01:21 -04:00 · 2026-03-26 15:12:55 +01:00 · 2026-03-25 15:28:42 +01:00
338 changed files with 5357 additions and 26666 deletions
--- a/.cargo/audit.toml
+++ b/.cargo/audit.toml
@@ -4,9 +4,6 @@ ignore = [
    "RUSTSEC-2024-0436",
    # Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
    "RUSTSEC-2025-0141",
-    # Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
-    # arkworks and we do not use custom loggers.
-    "RUSTSEC-2026-0097",
 ]

 [output]
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -82,10 +82,11 @@ runs:
        sudo apt update
        sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"

+# TODO: restore the command below once the nvcc check is fixed:
+#        find /usr/local -executable -name "nvcc"
    - name: Export CUDA variables
      shell: bash
      run: |
-        find /usr/local -executable -name "nvcc"
        CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
        {
          echo "CUDA_PATH=$CUDA_PATH";
--- a/.github/workflows/aws_data_tests.yml
+++ b/.github/workflows/aws_data_tests.yml
@@ -54,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -16,6 +16,7 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

+
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
@@ -36,7 +37,6 @@ jobs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
-      safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: "false"
+          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -79,7 +79,6 @@ jobs:
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
-              - utils/tfhe-safe-serialize/**
            csprng:
              - tfhe-csprng/**
            zk_pok:
@@ -87,8 +86,6 @@ jobs:
            versionable:
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
-            safe_serialize:
-              - utils/tfhe-safe-serialize/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -125,7 +122,6 @@ jobs:
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
          steps.changed-files.outputs.versionable_any_changed == 'true' ||
-          steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -149,7 +145,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: "false"
+          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -174,11 +170,6 @@ jobs:
        run: |
          make test_versionable

-      - name: Run tfhe-safe-serialize tests
-        if: needs.should-run.outputs.safe_serialize_test == 'true'
-        run: |
-          make test_safe_serialize
-
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -200,7 +191,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            ~/.nvm
@@ -213,7 +204,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -34,7 +34,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -14,11 +14,12 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

+
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [labeled]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -31,16 +32,16 @@ jobs:
    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read # Needed to check for file change
+      pull-requests: read  # Needed to check for file change
    outputs:
      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
-        steps.changed-files.outputs.wasm_any_changed }}
+          steps.changed-files.outputs.wasm_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: "false"
+          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -62,7 +63,6 @@ jobs:
                - tfhe/js_on_wasm_tests/**
                - tfhe/web_wasm_parallel_tests/**
                - utils/tfhe-versionable/**
-                - utils/tfhe-safe-serialize/**
                - .github/workflows/aws_tfhe_wasm_tests.yml

  wasm-tests:
@@ -78,7 +78,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: "false"
+          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -92,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            ~/.nvm
@@ -105,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -128,21 +128,15 @@ jobs:
        run: |
          make test_nodejs_wasm_api_ci

+      - name: Run parallel wasm tests
+        run: |
+          make test_web_js_api_parallel_chrome_ci
+
      - name: Run wasm_par_mq tests
        run: |
          make test_wasm_par_mq_chrome_ci
          make test_wasm_par_mq_firefox_ci

-      - name: Run parallel wasm tests
-        run: |
-          make test_web_js_api_parallel_chrome_ci
-          make test_web_js_api_parallel_firefox_ci
-
-      - name: Run cross origin wasm tests
-        run: |
-          make test_web_js_api_cross_origin_chrome_ci
-          make test_web_js_api_cross_origin_firefox_ci
-
      - name: Run x86_64/wasm zk compatibility tests
        run: |
          make test_zk_wasm_x86_compat_ci
--- a/.github/workflows/backward_compat_pr_change_report.yml
+++ b/.github/workflows/backward_compat_pr_change_report.yml
@@ -6,9 +6,6 @@ name: backward_compat_pr_change_report
 on:
  pull_request:

-env:
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
 permissions:
  contents: read

@@ -17,35 +14,9 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  should-run:
-    name: backward_compat_pr_change_report/should-run
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
-    outputs:
-      backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
-        with:
-          files_yaml: |
-            backward:
-              - utils/tfhe-lints/snapshots/*.json
-
  change-report:
    name: backward_compat_pr_change_report/change-report (bpr)
    runs-on: ubuntu-latest
-    needs: should-run
-    if:
-      needs.should-run.outputs.backward_report == 'true'
    permissions:
      pull-requests: write # To send and modify message in the PR
    steps:
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -14,12 +14,11 @@ on:
          - signed_integer
          - integer_compression
          - integer_zk
-          - msm_zk
          - shortint
          - shortint_oprf
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc7984
+          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
          - hlapi_kvstore
@@ -93,8 +92,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc7984":
-            files_to_parse.append("erc7984_pbs_count.csv")
+          elif inputs_command == "hlapi_erc20":
+            files_to_parse.append("erc20_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -107,7 +107,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set matrix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -108,14 +108,14 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-hlapi-erc7984:
-    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
+  run-benchmarks-hlapi-erc20:
+    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
-      command: hlapi_erc7984
-      additional_file_to_parse: erc7984_pbs_count.csv
+      command: hlapi_erc20
+      additional_file_to_parse: erc20_pbs_count.csv
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -17,10 +17,6 @@ on:
        description: "Run GPU core-crypto benchmarks"
        type: boolean
        default: true
-      run-gpu-zk-benchmarks:
-        description: "Run GPU ZK benchmarks"
-        type: boolean
-        default: true
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
@@ -40,7 +36,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc7984
+      command: integer,hlapi_erc20
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -93,9 +89,10 @@ jobs:
    uses: ./.github/workflows/benchmark_gpu_common.yml
    if: inputs.run-gpu-integer-benchmarks
    with:
-      profile: multi-h100-sxm5
+      backend: terraform
+      profile: scaleway-multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc7984
+      command: integer_multi_bit,hlapi_erc20
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -107,14 +104,14 @@ jobs:
      JOB_SECRET: ${{ secrets.JOB_SECRET }}
      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}

  run-benchmarks-hpu-integer:
    name: benchmark_documentation/run-benchmarks-hpu-integer
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc7984
+      command: integer,hlapi_erc20
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -154,7 +151,8 @@ jobs:
    uses: ./.github/workflows/benchmark_gpu_common.yml
    if: inputs.run-gpu-core-crypto-benchmarks
    with:
-      profile: multi-h100-sxm5
+      backend: terraform
+      profile: scaleway-multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
      command: pbs, ks_pbs
      bench_type: latency
@@ -167,44 +165,23 @@ jobs:
      JOB_SECRET: ${{ secrets.JOB_SECRET }}
      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-gpu-zk-server:
-    name: benchmark_documentation/run-benchmarks-gpu-zk-server
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    if: inputs.run-gpu-zk-benchmarks
-    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100-SXM5x8
-      command: integer_zk
-      op_flavor: default
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}

  generate-svgs-with-benchmarks-run:
    name: benchmark-documentation/generate-svgs-with-benchmarks-run
    if: ${{ always() &&
-      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
+      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
-      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
-      run-benchmarks-gpu-zk-server
+      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
    ]
    uses: ./.github/workflows/generate_svgs.yml
    with:
      time_span_days: 5
      generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
-      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
+      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
      generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -213,7 +190,7 @@ jobs:

  generate-svgs-without-benchmarks-run:
    name: benchmark-documentation/generate-svgs-without-benchmarks-run
-    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
+    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    uses: ./.github/workflows/generate_svgs.yml
    with:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,13 +31,10 @@ on:
          - pbs128
          - ks
          - ks_pbs
-          - tfhe_zk_pok
-          - msm_zk
          - integer_zk
-          - integer_zk_experimental
          - integer_aes
          - integer_aes256
-          - hlapi_erc7984
+          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
      op_flavor:
@@ -123,8 +120,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc7984":
-            files_to_parse.append("erc7984_pbs_count.csv")
+          elif inputs_command == "hlapi_erc20":
+            files_to_parse.append("erc20_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -111,7 +111,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set matrix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -126,11 +126,17 @@ jobs:
    needs: prepare-matrix
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      # Prefer the permanent instance label: the on-demand label output is set before the start-remote-instance step finishes,
+      # so it is populated even when that step later fails during the GitHub Actions runner setup.
+      # In that case we must fall back to the permanent instance, hence the order of the logical OR below;
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -139,6 +145,25 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

+      - name: Acknowledge remote instance failure
+        if: steps.start-remote-instance.outcome == 'failure' &&
+          inputs.profile != 'single-h100'
+        run: |
+          echo "Remote instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
+          echo "Permanent instance cannot be used as a substitute (profile needed: 'single-h100')"
+          exit 1 # Abort: no permanent fallback is available for this profile.
+        env:
+          INPUTS_PROFILE: ${{ inputs.profile }}
+
+      # This allows falling back to permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' &&
+          steps.start-remote-instance.outcome == 'failure' &&
+          inputs.profile == 'single-h100'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
  # Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
  install-dependencies:
    name: benchmark_gpu_common/install-dependencies
@@ -159,6 +184,7 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -307,13 +333,13 @@ jobs:

  teardown-instance:
    name: benchmark_gpu_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -42,7 +42,7 @@ env:
  OPTIMIZATION_TARGET: "throughput"
  BATCH_SIZE: "5000"
  SCHEDULING_POLICY: "MAX_PARALLELISM"
-  BENCHMARKS: "erc7984"
+  BENCHMARKS: "erc20"
  BRANCH_NAME: ${{ github.ref_name }}
  COMMIT_SHA: ${{ github.sha }}
  SLAB_SECRET: ${{ secrets.JOB_SECRET }}
@@ -77,7 +77,7 @@ jobs:
          if [[ ${IS_MANUAL_RUN} == true ]]; then
            PROFILE_RAW="${PROFILE_MANUAL_RUN}"
          else
-            PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
+            PROFILE_RAW="${PROFILE}"
          fi
          # shellcheck disable=SC2001
          PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
@@ -94,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
        with:
          path: |
            ~/.cargo/registry
@@ -214,14 +214,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -248,13 +248,13 @@ jobs:
          npm install && npm run deploy:emptyProxies && npx hardhat compile
        working-directory: fhevm/

-      - name: Profile erc7984 no-cmux benchmark on GPU
+      - name: Profile erc20 no-cmux benchmark on GPU
        run: |
          BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
          FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
          BENCHMARK_TYPE="THROUGHPUT_200" \
          OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
-          make -e "profile_erc7984_gpu"
+          make -e "profile_erc20_gpu"
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Get nsys profile name
@@ -333,7 +333,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -14,7 +14,7 @@ on:
          - integer
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc7984
+          - hlapi_erc20
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -95,7 +95,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set matrix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -387,7 +387,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -63,7 +63,7 @@ jobs:
          with open(env_file, "a") as f:
            f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")

-      - name: Set matrix arguments output
+      - name: Set matrix arguments output
        id: set_matrix_arg
        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
@@ -77,7 +77,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            ~/.nvm
@@ -137,7 +137,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -158,9 +158,9 @@ jobs:
        env:
          BROWSER: ${{ matrix.browser }}

-      - name: Run benchmarks (cross origin)
+      - name: Run benchmarks (unsafe coop)
        run: |
-          make bench_web_js_api_cross_origin_"${BROWSER}"_ci
+          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
        env:
          BROWSER: ${{ matrix.browser }}

@@ -218,7 +218,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -94,7 +94,7 @@ jobs:
          with open(env_file, "a") as f:
            f.write(f"""RUNNERS=["{'", "'.join(runners)}"]\n""")

-      - name: Set matrix runners outputs
+      - name: Set matrix runners outputs
        id: set_matrix_runners
        run: | # zizmor: ignore[template-injection] these env variable are safe
          echo "runners=${{ toJSON(env.RUNNERS) }}" >> "${GITHUB_OUTPUT}"
@@ -138,7 +138,7 @@ jobs:
      - name: Node cache restoration
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
        id: node-cache
-        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: |
            ~/.nvm
@@ -151,7 +151,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -50,7 +50,7 @@ jobs:
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -74,7 +74,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
+        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -88,7 +88,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
+        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -209,98 +209,60 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  gpu-zk-server-latency-table:
-    name: generate_documentation_svgs/gpu-zk-server-latency-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-gpu-svgs
-    with:
-      backend: gpu
-      hardware_name: n3-H100-SXM5x8
-      layer: integer
-      bench_subset: zk
-      pbs_kind: multi_bit
-      grouping_factor: 4
-      bench_type: latency
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-zk-benchmark-latency
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  gpu-zk-server-throughput-table:
-    name: generate_documentation_svgs/gpu-zk-server-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-gpu-svgs
-    with:
-      backend: gpu
-      hardware_name: n3-H100-SXM5x8
-      layer: integer
-      bench_subset: zk
-      pbs_kind: multi_bit
-      grouping_factor: 4
-      bench_type: throughput
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-zk-benchmark-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
  # -----------------------------------------------------------
-  # ERC7984 benchmarks tables
+  # ERC20 benchmarks tables
  # -----------------------------------------------------------

-  cpu-erc7984-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
+  cpu-erc20-latency-throughput-table:
+    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
      backend: cpu
      hardware_name: hpc7a.96xlarge
      layer: hlapi
-      bench_subset: erc7984
+      bench_subset: erc20
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
+      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  gpu-erc7984-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
+  gpu-erc20-latency-throughput-table:
+    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-gpu-svgs
    with:
      backend: gpu
      hardware_name: n3-H100-SXM5x8
      layer: hlapi
-      bench_subset: erc7984
+      bench_subset: erc20
      pbs_kind: multi_bit
      grouping_factor: 4
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
+      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  hpu-erc7984-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
+  hpu-erc20-latency-throughput-table:
+    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-hpu-svgs
    with:
      backend: hpu
      hardware_name: hpu_x1
      layer: hlapi
-      bench_subset: erc7984
+      bench_subset: erc20
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
+      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,24 +62,29 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Use the permanent remote instance label first, because the on-demand remote instance label output is set before the end of the start-remote-instance step.
+      # If that step fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand remote label is set before the failure, the logical OR must be evaluated in this order,
+      # otherwise we would try to run the next job on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -89,6 +93,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

+      # This allows falling back on permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -121,6 +132,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -164,14 +176,14 @@ jobs:

  teardown-instance:
    name: gpu_core_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -77,7 +77,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,11 +25,17 @@ jobs:
    name: gpu_full_h100_tests/setup-instance
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      # Use the permanent remote instance label first, because the on-demand remote instance label output is set before the end of the start-remote-instance step.
+      # If that step fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand remote label is set before the failure, the logical OR must be evaluated in this order,
+      # otherwise we would try to run the next job on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -38,6 +44,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

+      # This allows falling back on permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
  cuda-tests-linux:
    name: gpu_full_h100_tests/cuda-tests-linux
    needs: [ setup-instance ]
@@ -61,6 +74,7 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -104,13 +118,13 @@ jobs:

  teardown-instance:
    name: gpu_full_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +186,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -66,23 +65,27 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Use the permanent remote instance label first, because the on-demand remote instance label output is set before the end of the start-remote-instance step.
+      # If that step fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand remote label is set before the failure, the logical OR must be evaluated in this order,
+      # otherwise we would try to run the next job on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,6 +94,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

+      # This allows falling back on permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -123,6 +133,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -173,14 +184,14 @@ jobs:

  teardown-instance:
    name: gpu_hlapi_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -17,8 +17,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Weekly tests will be triggered every Monday at 8p.m.
-    - cron: "0 20 * * 1"
+    # Nightly tests will be triggered each evening at 8 p.m.
+    - cron: "0 20 * * *"
  pull_request:


@@ -28,48 +28,17 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  should-run:
-    name: gpu_integer_long_run_tests/should-run
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
-    outputs:
-      is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
-        with:
-          files_yaml: |
-            gpu:
-              - tfhe/Cargo.toml
-              - tfhe/build.rs
-              - backends/tfhe-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
-              - '.github/workflows/gpu_integer_long_run_tests.yml'
-
  setup-instance:
    name: gpu_integer_long_run_tests/setup-instance
-    needs: [should-run]
-    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -143,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -131,10 +131,6 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          make semgrep_and_lint_gpu_code
-
      - name: Run fmt checks
        run: |
          make check_fmt_gpu
@@ -143,6 +139,10 @@ jobs:
        run: |
          make pcc_gpu

+      - name: Run semgrep and lint checks on CUDA code
+        run: |
+          make semgrep_and_lint_gpu_code
+
      - name: Run semver checks on tfhe-cuda-backend
        run: |
          make semver_check_cuda_backend
@@ -176,7 +176,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -63,6 +63,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -79,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +169,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,25 +63,30 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Use the permanent remote instance label first, because the on-demand remote instance label output is set before the start-remote-instance step ends.
+      # If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
+      # Since the on-demand remote label is already set before the failure, the logical OR has to be done in this order,
+      # otherwise we would try to run the next job on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,6 +95,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

+      # This allows falling back on permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -123,6 +134,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -164,14 +176,14 @@ jobs:

  teardown-instance:
    name: gpu_signed_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -64,6 +64,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -177,7 +178,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -63,6 +63,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -79,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +169,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,25 +63,30 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      # Use the permanent remote instance label first, because the on-demand remote instance label output is set before the start-remote-instance step ends.
+      # If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
+      # Since the on-demand remote label is already set before the failure, the logical OR has to be done in this order,
+      # otherwise we would try to run the next job on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        continue-on-error: true
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,6 +95,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

+      # This allows falling back on permanent instances running on Hyperstack.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -123,6 +134,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -164,14 +176,14 @@ jobs:

  teardown-instance:
    name: gpu_unsigned_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -64,6 +64,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -177,7 +178,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -51,13 +51,7 @@ jobs:
        with:
          files_yaml: |
            gpu:
-              - tfhe/Cargo.toml
-              - tfhe/build.rs
-              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
-              - tfhe/src/shortint/parameters/**
-              - tfhe/src/zk/**
-              - tfhe-zk-pok/**
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

@@ -73,7 +67,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -132,9 +126,6 @@ jobs:
      - name: Run zk-cuda-backend integration tests
        run: |
          make test_zk_cuda_backend
-          make test_zk_pok_experimental_gpu
-          make test_integer_zk_gpu
-          make test_integer_zk_experimental_gpu

  slack-notify:
    name: gpu_zk_tests/slack-notify
@@ -167,7 +158,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -107,7 +107,7 @@ jobs:
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
+        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
        id: auth

      - name: Publish crate.io package
--- a/.github/workflows/make_release_common_cuda.yml
+++ b/.github/workflows/make_release_common_cuda.yml
@@ -1,36 +1,12 @@
-# Common workflow to make crate release for CUDA backend
-name: make_release_common_cuda
+name: make_release_cuda

 on:
-  workflow_call:
+  workflow_dispatch:
    inputs:
-      package-name:
-        type: string
-        required: true
-      dry-run:
+      dry_run:
+        description: "Dry-run"
        type: boolean
        default: true
-    secrets:
-      REPO_CHECKOUT_TOKEN:
-        required: true
-      SLAB_ACTION_TOKEN:
-        required: true
-      SLAB_BASE_URL:
-        required: true
-      SLAB_URL:
-        required: true
-      JOB_SECRET:
-        required: true
-      SLACK_CHANNEL:
-        required: true
-      BOT_USERNAME:
-        required: true
-      SLACK_WEBHOOK:
-        required: true
-      ALLOWED_TEAM:
-        required: true
-      READ_ORG_TOKEN:
-        required: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -45,15 +21,15 @@ permissions: {}

 jobs:
  verify-triggering-actor:
-    name: make_release_common_cuda/verify-triggering-actor
+    name: make_release_cuda/verify-triggering-actor
    if: startsWith(github.ref, 'refs/tags/')
    uses: ./.github/workflows/verify_triggering_actor.yml
    secrets:
-      ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  setup-instance:
-    name: make_release_common_cuda/setup-instance
+    name: make_release_cuda/setup-instance
    needs: verify-triggering-actor
    runs-on: ubuntu-latest
    outputs:
@@ -61,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -71,7 +47,7 @@ jobs:
          profile: gpu-build

  package:
-    name: make_release_common_cuda/package
+    name: make_release_cuda/package
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    outputs:
@@ -100,6 +76,7 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -112,6 +89,7 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -123,14 +101,12 @@ jobs:
          GCC_VERSION: ${{ matrix.gcc }}

      - name: Prepare package
-        env:
-          PACKAGE: ${{ inputs.package-name }}
        run: |
-          cargo package -p "${PACKAGE}"
+          cargo package -p tfhe-cuda-backend

      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
-          name: crate-${{ inputs.package-name }}
+          name: crate-tfhe-cuda-backend
          path: target/package/*.crate

      - name: generate hash
@@ -138,8 +114,8 @@ jobs:
        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

  provenance:
-    name: make_release_common_cuda/provenance
-    if: ${{ !inputs.dry-run  }}
+    name: make_release_cuda/provenance
+    if: ${{ !inputs.dry_run  }}
    needs: [package]
    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
@@ -152,7 +128,7 @@ jobs:
      base64-subjects: ${{ needs.package.outputs.hash }}

  publish-cuda-release:
-    name: make_release_common_cuda/publish-cuda-release
+    name: make_release_cuda/publish-cuda-release
    needs: [setup-instance, package] # for comparing hashes
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    permissions:
@@ -174,6 +150,7 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -186,6 +163,7 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -199,23 +177,22 @@ jobs:
      - name: Download artifact
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
-          name: crate-${{ inputs.package-name }}
+          name: crate-tfhe-cuda-backend
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
+        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
        id: auth

      - name: Publish crate.io package
        env:
          CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
-          PACKAGE: ${{ inputs.package-name }}
-          DRY-RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # dry-run expansion cannot be double quoted when variable contains empty string otherwise cargo publish
-          # would fail. This is safe since dry-run is handled in the env section above.
+          # DRY_RUN expansion cannot be double-quoted: when the variable contains an empty string, cargo publish
+          # would fail. This is safe since DRY_RUN is handled in the env section above.
          # shellcheck disable=SC2086
-          cargo publish -p "${PACKAGE}" ${DRY-RUN}
+          cargo publish -p tfhe-cuda-backend ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -227,7 +204,7 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -235,17 +212,17 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: make_release_common_cuda/teardown-instance
+    name: make_release_cuda/teardown-instance
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, publish-cuda-release]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -258,4 +235,4 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -16,10 +16,6 @@ on:
        description: "Push web js package"
        type: boolean
        default: true
-      push_web_compat_package:
-        description: "Push web compat (cross-origin) js package"
-        type: boolean
-        default: true
      push_node_package:
        description: "Push node js package"
        type: boolean
@@ -103,23 +99,6 @@ jobs:
          tag: ${{ env.NPM_TAG }}
          provenance: true

-      - name: Build web compat (cross-origin) package
-        if: ${{ inputs.push_web_compat_package }}
-        run: |
-          rm -rf tfhe/pkg
-
-          make build_web_js_api
-          sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
-
-      - name: Publish web compat (cross-origin) package
-        if: ${{ inputs.push_web_compat_package }}
-        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
-        with:
-          package: tfhe/pkg/package.json
-          dry-run: ${{ inputs.dry_run }}
-          tag: ${{ env.NPM_TAG }}
-          provenance: true
-
      - name: Build Node package
        if: ${{ inputs.push_node_package }}
        run: |
--- a/.github/workflows/make_release_tfhe_cuda.yml
+++ b/.github/workflows/make_release_tfhe_cuda.yml
@@ -1,44 +0,0 @@
-# Publish new release of tfhe-rs CUDA backend on crates.io.
-name: make_release_tfhe_cuda
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true
-
-env:
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-
-jobs:
-  make-release:
-    name: make_release_tfhe_cuda/make-release
-    uses: ./.github/workflows/make_release_common_cuda.yml
-    with:
-      package-name: "tfhe-cuda-backend"
-      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      actions: read # Needed to detect the GitHub Actions environment
-      id-token: write # Needed to create the provenance via GitHub OIDC
-      contents: write # Needed to upload assets/artifacts
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
-      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/make_release_tfhe_safe_serialize.yml
+++ b/.github/workflows/make_release_tfhe_safe_serialize.yml
@@ -1,32 +0,0 @@
-name: make_release_tfhe_safe_serialize
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-
-jobs:
-  make-release:
-    name: make_release_tfhe_safe_serialize/make-release
-    uses: ./.github/workflows/make_release_common.yml
-    with:
-      package-name: "tfhe-safe-serialize"
-      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      actions: read # Needed to detect the GitHub Actions environment
-      id-token: write # Needed to create the provenance via GitHub OIDC
-      contents: write # Needed to upload assets/artifacts
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
-      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
--- a/.github/workflows/make_release_zk_cuda.yml
+++ b/.github/workflows/make_release_zk_cuda.yml
@@ -1,44 +0,0 @@
-# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
-name: make_release_zk_cuda
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true
-
-env:
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-
-jobs:
-  make-release:
-    name: make_release_zk_cuda/make-release
-    uses: ./.github/workflows/make_release_common_cuda.yml
-    with:
-      package-name: "zk-cuda-backend"
-      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      actions: read # Needed to detect the GitHub Actions environment
-      id-token: write # Needed to create the provenance via GitHub OIDC
-      contents: write # Needed to upload assets/artifacts
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
-      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Restore Sagemath image from cache
        id: docker-cache
-        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
      - name: Store Sagemath image in cache
        if: steps.docker-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
-backends/tfhe-cuda-backend/cuda/build/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
@@ -35,9 +34,6 @@ node_modules/
 package-lock.json
 utils/wasm-par-mq/examples/*/pkg/

-# Commit lock files of backward data generation crates
-!utils/tfhe-backward-compat-data/crates/generate_*/Cargo.lock
-
 # Python .env
 .env
 __pycache__
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,12 +14,10 @@ members = [
    "tfhe-fft",
    "tfhe-ntt",
    "tfhe-zk-pok",
-    "utils/benchmark_spec",
    "utils/param_dedup",
    "utils/tfhe-backward-compat-checker",
    "utils/tfhe-backward-compat-data",
    "utils/tfhe-backward-compat-data/crates/add_new_version",
-    "utils/tfhe-safe-serialize",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/wasm-par-mq",
@@ -45,7 +43,6 @@ rand = "0.8"
 rayon = "1.11"
 serde = { version = "1.0", default-features = false }
 wasm-bindgen = { version = "0.2.114" }
-wasm-bindgen-futures = { version = "0.4.56" }
 # js-sys (at this point in time) automatically enables the unsafe-eval feature which we do not want
 # this does not prevent other deps from enabling it, but it at least conveys our need to not have it
 # we still enable std, which was part of default before
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2026 ZAMA.
+Copyright © 2025 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/Makefile
+++ b/Makefile
@@ -122,12 +122,6 @@ install_build_wasm32_target:
 	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
 	Rustup can be downloaded at https://rustup.rs/" && exit 1 )

-.PHONY: install_check_wasm32_target # Install the wasm32 toolchain used for checks
-install_check_wasm32_target:
-	rustup target add wasm32-unknown-unknown --toolchain "$(RS_CHECK_TOOLCHAIN)" || \
-	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
-	Rustup can be downloaded at https://rustup.rs/" && exit 1 )
-
 .PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
 install_cargo_nextest:
 	@cargo nextest --version > /dev/null 2>&1 || \
@@ -312,7 +306,7 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
 	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
 		| grep -v '/cmake-build-debug/' \
 		| grep -v '/build/' \
 		| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
 	venv/bin/python3 "scripts/check_scratch_cleanup.py"

 .PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
@@ -356,23 +350,23 @@ check_fmt_js: check_nvm_installed
 .PHONY: check_fmt_toml # Check TOML files format
 check_fmt_toml: install_taplo
 	@RUST_LOG=warn taplo fmt --check || \
 	{ echo "TOML files format check failed. Please run 'make fmt_toml'"; exit 1; }

 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
-	@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"
+	@typos && echo "No typos found"

 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p tfhe

@@ -386,7 +380,7 @@ clippy_hpu: install_rs_check_toolchain
 .PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
 clippy_gpu_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

@@ -479,7 +473,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 	fi && \
 	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
 		-p tfhe -- --nocapture

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -490,17 +484,11 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
-		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
 		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,parallel-wasm-api \
-		-p tfhe -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
@@ -541,15 +529,6 @@ clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings

-.PHONY: clippy_zk_pok_wasm # Run clippy lints on tfhe-zk-pok for wasm32 target
-clippy_zk_pok_wasm: install_rs_check_toolchain install_check_wasm32_target
-	RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--target wasm32-unknown-unknown \
-		-p tfhe-zk-pok -- --no-deps -D warnings
-	RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--target wasm32-unknown-unknown \
-		-p tfhe-zk-pok --features cross-origin-wasm -- --no-deps -D warnings
-
 .PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
 clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -557,11 +536,6 @@ clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-versionable -- --no-deps -D warnings

-.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
-clippy_safe_serialize: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-safe-serialize -- --no-deps -D warnings
-
 .PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
 clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -587,28 +561,15 @@ clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selec
 		echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
 	fi

-.PHONY: check_backward_compat_locks_did_not_change # Check backward compat Cargo.lock files are up to date
-check_backward_compat_locks_did_not_change: install_rs_check_toolchain
-	@for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
-		echo "checking Cargo.lock for $$crate"; \
-		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-			-C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate metadata --locked --format-version 1 > /dev/null || \
-		( echo "Cargo.lock for $$crate is out of date. Update it with:" && \
-		  echo "  cd $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate && cargo metadata --format-version 1 > /dev/null" && \
-		  echo "then commit the updated Cargo.lock." && exit 1 ); \
-	done
-
 .PHONY: clippy_test_vectors # Run clippy lints on the test vectors app
 clippy_test_vectors: install_rs_check_toolchain
 	cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-test-vectors -- --no-deps -D warnings

-# WARNING: This target is not directly run in CI. When adding a subtarget here,
-# MAKE SURE TO ALSO ADD IT TO A PCC BATCH BELOW
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
-clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
+clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
 clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq

 .PHONY: clippy_fast # Run main clippy targets
@@ -705,7 +666,7 @@ build_c_api: install_rs_check_toolchain
 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
 		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -714,14 +675,11 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
 		-p tfhe

-.PHONY: build_web_js_api # Build the js API targeting the web browser, in sequential or cross origin parallelism modes.
+.PHONY: build_web_js_api # Build the js API targeting the web browser
 build_web_js_api: install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api && \
-	find pkg/snippets -type f -iname worker_helpers.js -exec sed -i 's|import("../../..")|import("../../../tfhe.js")|g' {} \;
-	cp utils/wasm-par-mq/js/coordinator.js tfhe/pkg/
-	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 # parallel wasm requires specific build options, see https://github.com/rust-lang/rust/pull/147225
@@ -807,7 +765,7 @@ test_zk_cuda_backend:


 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu:
@@ -1243,31 +1201,12 @@ test_tfhe_csprng_big_endian: install_cargo_cross
 	RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu

+
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok --features experimental

-.PHONY: test_zk_pok_experimental_gpu # Run tfhe-zk-pok GPU-accelerated tests
-test_zk_pok_experimental_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
-
-.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
-test_integer_zk_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=integer,zk-pok,gpu -p tfhe -- \
-		integer::gpu::zk::
-
-.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
-test_integer_zk_experimental_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
-		integer::gpu::zk::
-
-.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
-test_zk_cuda: test_zk_cuda_backend test_zk_pok_experimental_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
-
 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
@@ -1286,11 +1225,6 @@ test_versionable:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		--all-targets -p tfhe-versionable

-.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
-test_safe_serialize:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--all-targets -p tfhe-safe-serialize
-
 # The backward compat data folder holds historical binary data but also rust code to generate and load them.
 .PHONY: gen_backward_compat_data # Re-generate backward compatibility data
 gen_backward_compat_data:
@@ -1425,19 +1359,6 @@ test_nodejs_wasm_api_ci: build_node_js_api

 # This is an internal target, not meant to be called on its own.
 run_web_js_api_parallel: build_web_js_api_parallel setup_venv
-	cd $(WEB_SERVER_DIR) && npm install && npm run build
-	source venv/bin/activate && \
-	python ci/webdriver.py \
-	--browser-path $(browser_path) \
-	--driver-path $(driver_path) \
-	--browser-kind  $(browser_kind) \
-	--server-cmd $(server_cmd) \
-	--server-workdir "$(WEB_SERVER_DIR)" \
-	--id-pattern $(filter) \
-	--id-exclude-pattern asyncMainThread
-
-# This is an internal target, not meant to be called on its own.
-run_web_js_api_cross_origin: build_web_js_api setup_venv
 	cd $(WEB_SERVER_DIR) && npm install && npm run build
 	source venv/bin/activate && \
 	python ci/webdriver.py \
@@ -1480,38 +1401,6 @@ test_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) test_web_js_api_parallel_firefox

-test_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-test_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-test_web_js_api_cross_origin_chrome: browser_kind = chrome
-test_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
-test_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
-
-.PHONY: test_web_js_api_cross_origin_chrome # Run tests for the web wasm api in cross-origin mode on Chrome
-test_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
-
-.PHONY: test_web_js_api_cross_origin_chrome_ci # Run tests for the web wasm api in cross-origin mode on Chrome
-test_web_js_api_cross_origin_chrome_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) test_web_js_api_cross_origin_chrome
-
-test_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-test_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-test_web_js_api_cross_origin_firefox: browser_kind = firefox
-test_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
-test_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeTest  # Only run zk proof tests in cross-origin mode
-
-.PHONY: test_web_js_api_cross_origin_firefox # Run tests for the web wasm api in cross-origin mode on Firefox
-test_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
-
-.PHONY: test_web_js_api_cross_origin_firefox_ci # Run tests for the web wasm api in cross-origin mode on Firefox
-test_web_js_api_cross_origin_firefox_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) test_web_js_api_cross_origin_firefox
-
 WASM_PAR_MQ_TEST_DIR=utils/wasm-par-mq/web_tests

 .PHONY: build_wasm_par_mq_tests # Build the wasm-par-mq test WASM package
@@ -1675,50 +1564,27 @@ bench_integer_rerand_gpu: install_rs_check_toolchain
 	--bench integer-rerand \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_msm_zk
-bench_msm_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench zk-msm \
-	--features=zk-pok -p tfhe-benchmark --profile release --
-
-# GPU benchmarks need --profile release for correct measurements
-.PHONY: bench_msm_zk_gpu
-bench_msm_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench zk-msm \
-	--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release -- zk::cuda::msm
-
-# GPU benchmarks need --profile release for correct measurements
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
-
-# GPU benchmarks need --profile release for correct measurements
-.PHONY: bench_integer_zk_experimental_gpu
-bench_integer_zk_experimental_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
 bench_integer_aes_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes \
 	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
 bench_integer_aes256_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes256 \
 	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
 bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1882,37 +1748,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_parallel_firefox

-bench_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-bench_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-bench_web_js_api_cross_origin_chrome: browser_kind = chrome
-bench_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
-bench_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
+bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
+bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
+bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop

-.PHONY: bench_web_js_api_cross_origin_chrome # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
+.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel

-.PHONY: bench_web_js_api_cross_origin_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_cross_origin_chrome_ci: setup_venv
+.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_cross_origin_chrome
+	$(MAKE) bench_web_js_api_unsafe_coop_chrome

-bench_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-bench_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-bench_web_js_api_cross_origin_firefox: browser_kind = firefox
-bench_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
-bench_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
+bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
+bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
+bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
+bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
+bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop

-.PHONY: bench_web_js_api_cross_origin_firefox # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
+.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel

-.PHONY: bench_web_js_api_cross_origin_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_cross_origin_firefox_ci: setup_venv
+.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_cross_origin_firefox
+	$(MAKE) bench_web_js_api_unsafe_coop_firefox

 .PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
 bench_hlapi_unsigned: install_rs_check_toolchain
@@ -1945,25 +1811,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
 	--bench hlapi \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
-bench_hlapi_erc7984: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
+bench_hlapi_erc20: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
+	--bench hlapi-erc20 \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
-bench_hlapi_erc7984_gpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
+bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
+	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
-bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
+bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
+	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
@@ -1987,13 +1853,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ECR20 operations on HPU
-bench_hlapi_erc7984_hpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
+bench_hlapi_erc20_hpu: install_rs_check_toolchain
 	source ./setup_hpu.sh --config $(HPU_CONFIG); \
 	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
+	--bench hlapi-erc20 \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
@@ -2001,13 +1867,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

-.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
-bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--package tfhe-zk-pok \
-	--features=gpu-experimental --profile release
-
 .PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
 bench_hlapi_noise_squash: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
@@ -2049,10 +1908,10 @@ bench_summary: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'

-	# ERC7984
+	# ERC20
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
+	--bench hlapi-erc20 \
 	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'

 	# DEX
@@ -2094,10 +1953,10 @@ bench_summary_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'

-	# ERC7984
+	# ERC20
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
+	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'

 	# DEX
@@ -2276,7 +2135,6 @@ pcc_batch_5:
 	$(call run_recipe_with_details,clippy_tfhe_lints)
 	$(call run_recipe_with_details,check_compile_tests)
 	$(call run_recipe_with_details,clippy_backward_compat_data)
-	$(call run_recipe_with_details,check_backward_compat_locks_did_not_change)

 .PHONY: pcc_batch_6  # duration: 6'32''
 pcc_batch_6:
@@ -2285,10 +2143,8 @@ pcc_batch_6:
 	$(call run_recipe_with_details,clippy_tasks)
 	$(call run_recipe_with_details,clippy_tfhe_csprng)
 	$(call run_recipe_with_details,clippy_zk_pok)
-	$(call run_recipe_with_details,clippy_zk_pok_wasm)
 	$(call run_recipe_with_details,clippy_trivium)
 	$(call run_recipe_with_details,clippy_versionable)
-	$(call run_recipe_with_details,clippy_safe_serialize)
 	$(call run_recipe_with_details,clippy_param_dedup)
 	$(call run_recipe_with_details,docs)

--- a/_typos.toml
+++ b/_typos.toml
@@ -15,3 +15,12 @@ extend-ignore-identifiers-re = [
    "0x[0-9a-fA-F]+",
    "xrt_coreutil",
 ]
+
+[files]
+extend-exclude = [
+    "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
+    "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
+    "backends/tfhe-hpu-backend/config_store/**/*.link_summary",
+    "*.cbor",
+    "*.bcode",
+]
--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2026 ZAMA.
+Copyright © 2025 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
+++ b/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
@@ -62,29 +62,3 @@ rules:
            cuda_synchronize_stream(...);
            ...
          }
-
-  - id: tfhe-cuda-unwrapped-cuda-runtime-call
-    message: "CUDA runtime API call is not wrapped in `check_cuda_error(...)`."
-    severity: WARNING
-    languages: [c, cpp]
-    options:
-      generic_ellipsis_max_span: 500
-    paths:
-      include:
-        - "*.cu"
-        - "*.cuh"
-        - "*.cpp"
-        - "*.h"
-      exclude:
-        - backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
-        - backends/tfhe-cuda-backend/cuda/include/device.h # contains the cuda_check_error macro (and others)
-    patterns:
-      - pattern: $FUNC(...)
-      - metavariable-regex:
-          metavariable: $FUNC
-          regex: "^cuda[A-Z][A-Za-z0-9]*$" # matches cudaMalloc/cudaMemcpy/... (not project helpers like cuda_set_device)
-      - pattern-not-inside: check_cuda_error(...)
-      - pattern-not-inside: |
-          $FUNC(...);
-          check_cuda_error(cudaGetLastError());
-      - pattern-not-inside: $FUNC(...) == $VAL
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -36,19 +36,5 @@ void cuda_glwe_sample_extract_128_async(
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
    uint32_t glwe_dimension, uint32_t polynomial_size);
-
-void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
-                                            void *lwe_array_out,
-                                            void *lwe_array_in, uint32_t size,
-                                            uint32_t log_modulus,
-                                            uint32_t degree,
-                                            uint32_t grouping_factor);
-
-void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
-                                             void *lwe_array_out,
-                                             void *lwe_array_in, uint32_t size,
-                                             uint32_t log_modulus,
-                                             uint32_t degree,
-                                             uint32_t grouping_factor);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -382,17 +382,14 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                       ->use_sequential_algorithm_to_resolve_group_carries;

    cuda_set_device(0);
-    check_cuda_error(
-        cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
+    cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming);
    create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
                                    use_seq, allocate_gpu_memory, size_tracker);
-    check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
+    cudaEventRecord(create_indexes_done, streams.stream(0));
    cuda_set_device(1);
-    check_cuda_error(
-        cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
+    cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0);
    cuda_set_device(2);
-    check_cuda_error(
-        cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));
+    cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0);

    scatter_indexes_for_overflowing_sub(
        streams.stream(1), streams.gpu_index(1),
@@ -845,7 +842,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    free(second_indexes_for_overflow_sub_gpu_2);
    free(scalars_for_overflow_sub_gpu_2);

-    check_cuda_error(cudaEventDestroy(create_indexes_done));
+    cudaEventDestroy(create_indexes_done);

    // release sub streams
    sub_streams_1.release();
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -39,28 +39,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
                                                      uint32_t gpu_index,
                                                      int8_t **pbs_buffer);

-// Noise-tests-namespaced wrappers for scratch/cleanup, so that callers
-// working with the noise-tests PBS variant use a consistent naming scheme.
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
-
-// Noise tests variant: 64-bit torus, polynomial_size=2048 only. Uses the
-// NOISE_TESTS keybundle mode for noise analysis purposes.
-void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
-
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
@@ -78,23 +56,6 @@ void cuda_multi_bit_programmable_bootstrap_128_async(
 void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer);
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
-
-void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lwe_array_in, void const *lwe_input_indexes,
-    void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride);
 }

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
  uint32_t num_lwes;
  uint32_t num_compact_lists;

-  int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
-  int_radix_lut<Torus> *identity_lut = nullptr;
+  int_radix_lut<Torus> *message_and_carry_extract_luts;
+  int_radix_lut<Torus> *identity_lut;

-  Torus *tmp_expanded_lwes = nullptr;
-  Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;
+  Torus *tmp_expanded_lwes;
+  Torus *tmp_ksed_small_to_big_expanded_lwes;

  bool gpu_memory_allocated;

@@ -148,6 +148,66 @@ template <typename Torus> struct zk_expand_mem {
      PANIC("GPU backend requires carry_modulus equal to message_modulus")
    }

+    // We create the identity LUT only if we are doing a SANITY_CHECK
+    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+      identity_lut =
+          new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
+                                   allocate_gpu_memory, size_tracker);
+
+      auto identity_lut_f = [](Torus x) -> Torus { return x; };
+
+      identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
+                                               LUT_0_FOR_ALL_BLOCKS);
+    }
+
+    auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
+      return x % casting_params.message_modulus;
+    };
+    auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
+      return (x / casting_params.carry_modulus) %
+             casting_params.message_modulus;
+    };
+
+    // Booleans have to be sanitized
+    auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
+    auto message_extract_and_sanitize_bool_lut_f =
+        [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+      return sanitize_bool_f(message_extract_lut_f(x));
+    };
+    auto carry_extract_and_sanitize_bool_lut_f =
+        [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+      return sanitize_bool_f(carry_extract_lut_f(x));
+    };
+
+    /** In case the casting key casts from BIG to SMALL key we run a single KS
+    to expand using the casting key as ksk. Otherwise, in case the casting key
+    casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
+    the casting key as ksk, then we keyswitch from BIG to SMALL using the
+    computing ksk, and lastly we apply the PBS. The output is always on the
+    BIG key.
+    **/
+    auto params = casting_params;
+    if (casting_key_type == SMALL_TO_BIG) {
+      params = computing_params;
+    }
+    message_and_carry_extract_luts = new int_radix_lut<Torus>(
+        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
+
+    // We are always packing two LWEs. We just need to be sure we have enough
+    // space in the carry part to store a message of the same size as the one
+    // in the message part.
+    if (params.carry_modulus < params.message_modulus)
+      PANIC("Carry modulus must be at least as large as message modulus");
+    auto num_packed_msgs = 2;
+
+    // Adjust indexes to permute the output and access the correct LUT
+    auto h_indexes_in = static_cast<Torus *>(
+        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+    auto h_indexes_out = static_cast<Torus *>(
+        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+    auto h_lut_indexes = static_cast<Torus *>(
+        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+
    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
@@ -156,202 +216,144 @@ template <typename Torus> struct zk_expand_mem {
    h_expand_jobs = static_cast<expand_job<Torus> *>(
        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));

-    // NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
-    // no intermediate buffers needed.
-    if (expand_kind != EXPAND_KIND::NO_CASTING) {
-      /** In case the casting key casts from BIG to SMALL key we run a single KS
-      to expand using the casting key as ksk. Otherwise, in case the casting key
-      casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
-      the casting key as ksk, then we keyswitch from BIG to SMALL using the
-      computing ksk, and lastly we apply the PBS. The output is always on the
-      BIG key.
-      **/
-      auto params = casting_params;
-      if (casting_key_type == SMALL_TO_BIG) {
-        params = computing_params;
+    /*
+     * Each LWE contains encrypted data in both carry and message spaces
+     * that needs to be extracted.
+     *
+     * The loop processes each compact list (k) and for each LWE within that
+     * list:
+     * 1. Sets input indexes to read each LWE twice (for carry and message
+     * extraction)
+     * 2. Creates output indexes to properly reorder the results
+     * 3. Selects appropriate LUT index based on whether boolean sanitization is
+     * needed
+     *
+     * We want the output to always contain the message part first, followed
+     * by the carry part, for each LWE.
+     *
+     * i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
+     * carry_extract(LWE_1), ...
+     *
+     * To achieve that behavior, with 4 LWEs we would have:
+     *
+     * // Each LWE is processed twice
+     * h_indexes_in   = {0, 1, 2, 3, 0, 1, 2, 3}
+     *
+     * // First 4 use message LUT, last 4 use carry LUT
+     * h_lut_indexes  = {0, 0, 0, 0, 1, 1, 1, 1}
+     *
+     * // Reorders output so message and carry for each LWE appear together
+     * h_indexes_out  = {0, 2, 4, 6, 1, 3, 5, 7}
+     *
+     * If an LWE contains a boolean value, its LUT index is shifted by
+     * num_packed_msgs to use the sanitization LUT (which ensures output is
+     * exactly 0 or 1).
+     */
+    auto offset = 0;
+    for (int k = 0; k < num_compact_lists; k++) {
+      auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
+      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
+        auto lwe_index = i + num_packed_msgs * offset;
+        auto lwe_index_in_list = i % num_lwes_in_kth;
+        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       lwe_index, num_packed_msgs * num_lwes);
+        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
+        h_indexes_out[lwe_index] =
+            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
+        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %lu is beyond the max value %lu",
+                       (unsigned long)h_indexes_in[lwe_index],
+                       (unsigned long)(num_packed_msgs * num_lwes));
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %lu is beyond the max value %lu",
+                       (unsigned long)h_indexes_out[lwe_index],
+                       (unsigned long)(num_packed_msgs * num_lwes));
+        // is_boolean_array tells us which input is a boolean and thus the
+        // related output needs boolean sanitization. It naturally has
+        // total_blocks entries, but h_indexes_out reaches
+        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
+        // the ceiling causes out-of-bounds access. Reading garbage "true" would
+        // set h_lut_indexes to an invalid index pointing to uninitialized
+        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
+        // to match.
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                       "Cuda error: index %lu for is_boolean_array is out of "
+                       "bounds (len is %lu)",
+                       (unsigned long)h_indexes_out[lwe_index],
+                       (unsigned long)is_boolean_array_len);
      }
+      offset += num_lwes_in_kth;
+    }

-      // We always pack two LWEs (message and carry parts per LWE)
-      auto num_packed_msgs = 2;
+    message_and_carry_extract_luts->set_lwe_indexes(
+        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);

-      // Adjust indexes to permute the output and access the correct LUT.
-      //
-      // The loop below fills h_indexes_in and h_indexes_out so that the output
-      // is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
-      // msg_extract(LWE_1), carry_extract(LWE_1), ...
-      //
-      // With 4 LWEs the arrays look like:
-      //   h_indexes_in  = {0, 1, 2, 3, 0, 1, 2, 3}  (each LWE read twice)
-      //   h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}  (msg LUT then carry LUT)
-      //   h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}  (interleaved output)
-      //
-      // If an LWE contains a boolean its LUT index is shifted by
-      // num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
-      auto h_indexes_in = static_cast<Torus *>(
-          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-      auto h_indexes_out = static_cast<Torus *>(
-          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+    auto active_streams =
+        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);

+    // Index generator for message/carry extraction LUTs
+    auto index_gen = [num_compact_lists,
+                      num_lwes_per_compact_list =
+                          this->num_lwes_per_compact_list,
+                      num_packed_msgs, is_boolean_array,
+                      h_indexes_out](Torus *h_lut_indexes, uint32_t) {
      auto offset = 0;
      for (int k = 0; k < num_compact_lists; k++) {
-        auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
+        auto num_lwes_in_kth = num_lwes_per_compact_list[k];
        for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
          auto lwe_index = i + num_packed_msgs * offset;
-          auto lwe_index_in_list = i % num_lwes_in_kth;
-          PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
-                         "Cuda error: index %d is beyond the max value %d",
-                         lwe_index, num_packed_msgs * num_lwes);
-          h_indexes_in[lwe_index] = lwe_index_in_list + offset;
-          h_indexes_out[lwe_index] =
-              num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-          PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
-                         "Cuda error: index %lu is beyond the max value %lu",
-                         (unsigned long)h_indexes_in[lwe_index],
-                         (unsigned long)(num_packed_msgs * num_lwes));
-          PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
-                         "Cuda error: index %lu is beyond the max value %lu",
-                         (unsigned long)h_indexes_out[lwe_index],
-                         (unsigned long)(num_packed_msgs * num_lwes));
-          // is_boolean_array tells us which input is a boolean and thus the
-          // related output needs boolean sanitization. It naturally has
-          // total_blocks entries, but h_indexes_out reaches
-          // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is
-          // odd, the ceiling causes out-of-bounds access. Reading garbage
-          // "true" would set h_lut_indexes to an invalid index pointing to
-          // uninitialized memory instead of a real LUT. Rust pads
-          // is_boolean_array with FALSE to match.
-          PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
-                         "Cuda error: index %lu for is_boolean_array is out of "
-                         "bounds (len is %lu)",
-                         (unsigned long)h_indexes_out[lwe_index],
-                         (unsigned long)is_boolean_array_len);
+          auto boolean_offset =
+              is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
+          h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
        }
        offset += num_lwes_in_kth;
      }
+    };

-      auto active_streams =
-          streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
+    message_and_carry_extract_luts->generate_and_broadcast_lut(
+        active_streams, {0, 1, 2, 3},
+        {message_extract_lut_f, carry_extract_lut_f,
+         message_extract_and_sanitize_bool_lut_f,
+         carry_extract_and_sanitize_bool_lut_f},
+        index_gen, true, {}, h_lut_indexes);

-      // SANITY_CHECK uses identity_lut (skipping the full message/carry
-      // extraction LUT and the SMALL_TO_BIG intermediate buffer).
-      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-        identity_lut =
-            new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
-                                     allocate_gpu_memory, size_tracker);
+    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
+        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
+    // The expanded LWEs will always be on the casting key format
+    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
+        safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
+        streams.stream(0), streams.gpu_index(0), size_tracker,
+        allocate_gpu_memory);

-        auto identity_lut_f = [](Torus x) -> Torus { return x; };
-        identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
-                                                 LUT_0_FOR_ALL_BLOCKS);
-        identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
-                                      h_indexes_in, h_indexes_out);
-        identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
-            active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
-      } else {
-        // We are always packing two LWEs. We just need to be sure we have
-        // enough space in the carry part to store a message of the same size
-        // as is in the message part.
-        if (params.carry_modulus < params.message_modulus)
-          PANIC("Carry modulus must be at least as large as message modulus");
-
-        message_and_carry_extract_luts =
-            new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
-                                     allocate_gpu_memory, size_tracker);
-        message_and_carry_extract_luts->set_lwe_indexes(
-            streams.stream(0), streams.gpu_index(0), h_indexes_in,
-            h_indexes_out);
-
-        auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
-          return x % casting_params.message_modulus;
-        };
-        auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
-          return (x / casting_params.carry_modulus) %
-                 casting_params.message_modulus;
-        };
-        auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
-        auto message_extract_and_sanitize_bool_lut_f =
-            [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-          return sanitize_bool_f(message_extract_lut_f(x));
-        };
-        auto carry_extract_and_sanitize_bool_lut_f =
-            [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-          return sanitize_bool_f(carry_extract_lut_f(x));
-        };
-
-        auto h_lut_indexes = static_cast<Torus *>(
-            malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-
-        auto index_gen = [num_compact_lists,
-                          num_lwes_per_compact_list =
-                              this->num_lwes_per_compact_list,
-                          num_packed_msgs, is_boolean_array,
-                          h_indexes_out](Torus *h_lut_indexes, uint32_t) {
-          auto offset = 0;
-          for (int k = 0; k < num_compact_lists; k++) {
-            auto num_lwes_in_kth = num_lwes_per_compact_list[k];
-            for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
-              auto lwe_index = i + num_packed_msgs * offset;
-              auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
-                                        ? num_packed_msgs
-                                        : 0;
-              h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
-            }
-            offset += num_lwes_in_kth;
-          }
-        };
-
-        message_and_carry_extract_luts->generate_and_broadcast_lut(
-            active_streams, {0, 1, 2, 3},
-            {message_extract_lut_f, carry_extract_lut_f,
-             message_extract_and_sanitize_bool_lut_f,
-             carry_extract_and_sanitize_bool_lut_f},
-            index_gen, true, {}, h_lut_indexes);
-        message_and_carry_extract_luts
-            ->allocate_lwe_vector_for_non_trivial_indexes(
-                active_streams, 2 * num_lwes, size_tracker,
-                allocate_gpu_memory);
-        free(h_lut_indexes);
-
-        // SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
-        // on the full casting path.
-        tmp_ksed_small_to_big_expanded_lwes =
-            (Torus *)cuda_malloc_with_size_tracking_async(
-                safe_mul_sizeof<Torus>(num_lwes,
-                                       casting_params.big_lwe_dimension + 1),
-                streams.stream(0), streams.gpu_index(0), size_tracker,
-                allocate_gpu_memory);
-      }
-
-      // The expanded LWEs will always be on the casting key format
-      tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-          safe_mul_sizeof<Torus>(num_lwes,
-                                 casting_params.big_lwe_dimension + 1),
-          streams.stream(0), streams.gpu_index(0), size_tracker,
-          allocate_gpu_memory);
-
-      free(h_indexes_in);
-      free(h_indexes_out);
-    }
+    tmp_ksed_small_to_big_expanded_lwes =
+        (Torus *)cuda_malloc_with_size_tracking_async(
+            safe_mul_sizeof<Torus>(num_lwes,
+                                   casting_params.big_lwe_dimension + 1),
+            streams.stream(0), streams.gpu_index(0), size_tracker,
+            allocate_gpu_memory);

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+    free(h_indexes_in);
+    free(h_indexes_out);
+    free(h_lut_indexes);
  }

  void release(CudaStreams streams) {
-    if (expand_kind != EXPAND_KIND::NO_CASTING) {
-      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-        identity_lut->release(streams);
-        delete identity_lut;
-      } else {
-        message_and_carry_extract_luts->release(streams);
-        delete message_and_carry_extract_luts;
-        cuda_drop_with_size_tracking_async(
-            tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
-            streams.gpu_index(0), gpu_memory_allocated);
-      }
-      cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
-                                         streams.gpu_index(0),
-                                         gpu_memory_allocated);
+    message_and_carry_extract_luts->release(streams);
+    delete message_and_carry_extract_luts;
+
+    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+      identity_lut->release(streams);
+      delete identity_lut;
    }

+    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
+                                       streams.gpu_index(0),
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
+                                       streams.stream(0), streams.gpu_index(0),
+                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
@@ -390,7 +390,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_a[6], &wires_a[15], &input_bits[7]);
  XOR(&wires_a[10], &wires_a[15], &wires_b[0]);
  XOR(&wires_a[11], &wires_a[20], &wires_a[9]);
-  FLUSH(&wires_a[6], &wires_a[10], &wires_a[11]);
+  FLUSH(&wires_a[6], &wires_a[10]);
  XOR(&wires_a[7], &input_bits[7], &wires_a[11]);
  FLUSH(&wires_a[7]);
  XOR(&wires_a[17], &wires_a[10], &wires_a[11]);
@@ -426,7 +426,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[22], &wires_b[18], &wires_a[19]);
  XOR(&wires_b[23], &wires_b[19], &wires_a[21]);
  XOR(&wires_b[24], &wires_b[20], &wires_a[18]);
-  FLUSH(&wires_b[21], &wires_b[22], &wires_b[23], &wires_b[24]);
+  FLUSH(&wires_b[21], &wires_b[23], &wires_b[24]);
  XOR(&wires_b[25], &wires_b[21], &wires_b[22]);
  FLUSH(&wires_b[25]);

@@ -468,7 +468,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,

  XOR(&wires_b[37], &wires_b[36], &wires_b[34]);
  XOR(&wires_b[38], &wires_b[27], &wires_b[36]);
-  FLUSH(&wires_b[38], &wires_b[37]);
+  FLUSH(&wires_b[38]);
  XOR(&wires_b[44], &wires_b[33], &wires_b[37]);

  CudaRadixCiphertextFFI *and_outs_6[] = {&wires_b[39]};
@@ -479,7 +479,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[40], &wires_b[25], &wires_b[39]);
  XOR(&wires_b[41], &wires_b[40], &wires_b[37]);
  XOR(&wires_b[43], &wires_b[29], &wires_b[40]);
-  FLUSH(&wires_b[41], &wires_b[40], &wires_b[43], &wires_b[44]);
+  FLUSH(&wires_b[41]);
  XOR(&wires_b[45], &wires_b[42], &wires_b[41]);
  FLUSH(&wires_b[45]);

@@ -514,7 +514,6 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[57], &wires_b[50], &wires_b[53]);
  XOR(&wires_b[58], &wires_c[4], &wires_b[46]);
  XOR(&wires_b[59], &wires_c[3], &wires_b[54]);
-  FLUSH(&wires_b[57], &wires_b[58]);
  XOR(&wires_b[60], &wires_b[46], &wires_b[57]);
  XOR(&wires_b[61], &wires_c[14], &wires_b[57]);
  XOR(&wires_b[62], &wires_b[52], &wires_b[58]);
@@ -590,7 +589,6 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
 #undef FLUSH
 #undef AND
 #undef ADD_ONE_FLUSH
-#undef ADD_ONE
 }

 /**
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -150,31 +150,3 @@ void cuda_glwe_sample_extract_128_async(
          "N's are powers of two in the interval [256..4096].")
  }
 }
-
-void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
-                                            void *lwe_array_out,
-                                            void *lwe_array_in, uint32_t size,
-                                            uint32_t log_modulus,
-                                            uint32_t degree,
-                                            uint32_t grouping_factor) {
-
-  host_modulus_switch_multi_bit<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
-      grouping_factor);
-}
-
-void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
-                                             void *lwe_array_out,
-                                             void *lwe_array_in, uint32_t size,
-                                             uint32_t log_modulus,
-                                             uint32_t degree,
-                                             uint32_t grouping_factor) {
-
-  host_modulus_switch_multi_bit<__uint128_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<__uint128_t *>(lwe_array_out),
-      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
-      grouping_factor);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -463,48 +463,5 @@ __global__ void __launch_bounds__(512)
      return;
  }
 }
-// This function is only used for noise tests, it follows the same logic
-// that is embedded in the keybundle just we need a global function to
-// be able to test it individually.
-template <typename Torus, class params>
-__global__ void
-modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
-                         uint32_t log_modulus, uint32_t grouping_factor) {
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < size) {
-    int num_monomials = 1 << grouping_factor;
-    int input_offset = tid * grouping_factor;
-    int output_offset = tid * num_monomials;
-    // We calculate all monomials even if the first one is never used.
-    for (int ggsw_idx = 0; ggsw_idx < num_monomials; ggsw_idx++) {
-      array_out[ggsw_idx + output_offset] =
-          calculates_monomial_degree<Torus, params>(&array_in[input_offset],
-                                                    ggsw_idx, grouping_factor);
-    }
-  }
-}
-// This aims to be launched only from the noise tests.
-//  That is why we support a specific set of parameters
-template <typename Torus>
-__host__ void host_modulus_switch_multi_bit(
-    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
-    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  int multibit_size = size / grouping_factor;
-  int num_threads = 0, num_blocks = 0;
-  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
-  switch (degree) {
-  case 2048:
-    modulus_switch_multi_bit<Torus, Degree<2048>>
-        <<<num_blocks, num_threads, 0, stream>>>(
-            array_out, array_in, multibit_size, log_modulus, grouping_factor);
-    break;
-  default:
-    PANIC("Cuda error: unsupported polynomial size. Supported "
-          "N's are powers of two in the interval [2048].")
-  };
-
-  check_cuda_error(cudaGetLastError());
-}

 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -326,10 +326,6 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index) {
  if (size == 0)
    return;
-
-  GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
-  GPU_ASSERT(dest != nullptr, "Cuda error: null device ptr");
-
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  PANIC_IF_FALSE(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -373,8 +373,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  check_cuda_error(
-      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -420,39 +420,6 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
  }
 }

-// Noise tests variant: identical to host_cg_multi_bit_programmable_bootstrap
-// but uses NOISE_TESTS keybundle mode.
-template <typename Torus, class params>
-__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-
-  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
-       lwe_offset += lwe_chunk_size) {
-
-    // Compute a keybundle with NOISE_TESTS mode instead of GENERIC
-    execute_compute_keybundle_noise_tests<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
-
-    execute_cg_external_product_loop<Torus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride);
-  }
-}
-
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
@@ -517,8 +484,10 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  check_cuda_error(
-      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
+  // Keep the error check: if the query fails, number_of_sm may be left at its
+  // initial 0 and the comparison below would be made against bogus data.
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -784,9 +784,9 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc_128(
      device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
      cudaFuncAttributeMaxDynamicSharedMemorySize,
      full_sm)); // full_sm + minimum_sm_tbc));
-  check_cuda_error(cudaFuncSetCacheConfig(
+  check_cuda_error(cudaFuncSetCacheConfig(
      device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
-      cudaFuncCachePreferShared));
+      cudaFuncCachePreferShared));
  check_cuda_error(cudaFuncSetAttribute(
      device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
      cudaFuncAttributeNonPortableClusterSizeAllowed, true));
@@ -1271,8 +1271,10 @@ __host__ bool verify_cuda_programmable_bootstrap_128_cg_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  check_cuda_error(
-      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
+  // Keep the error check: if the query fails, number_of_sm may be left at its
+  // initial 0 and the comparison below would be made against bogus data.
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));

  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -645,103 +645,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
  *buffer = nullptr;
 }

-// Noise-tests-namespaced wrappers: delegate to the standard scratch/cleanup so
-// that callers using the noise-tests PBS variant have a consistent API.
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-  return scratch_cuda_multi_bit_programmable_bootstrap_64_async(
-      stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
-      level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
-}
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
-  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
-                                                   pbs_buffer);
-}
-
-// Noise tests variant of the 64-bit multi-bit PBS, restricted to
-// polynomial_size=2048. The main difference is that the input
-// is assumed to be modulus switched before bootstrapping.
-void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
-
-  PANIC_IF_FALSE(num_samples == 1,
-                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
-                 num_samples);
-
-  PANIC_IF_FALSE(base_log <= 64,
-                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
-                 base_log);
-  PANIC_IF_FALSE(polynomial_size == 2048,
-                 "Cuda error (multi-bit PBS noise tests): only polynomial "
-                 "size 2048 is supported, got %d.",
-                 polynomial_size);
-
-  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
-      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
-
-  switch (buffer->pbs_variant) {
-  case PBS_VARIANT::TBC:
-#if CUDA_ARCH >= 900
-  {
-    host_tbc_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
-                                                          Degree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_output_indexes),
-        static_cast<const uint64_t *>(lut_vector),
-        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(lwe_input_indexes),
-        static_cast<const uint64_t *>(bootstrapping_key), buffer,
-        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-        base_log, level_count, num_samples, num_many_lut, lut_stride);
-  } break;
-#else
-    PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
-#endif
-  case PBS_VARIANT::CG:
-    host_cg_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
-                                                         Degree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_output_indexes),
-        static_cast<const uint64_t *>(lut_vector),
-        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(lwe_input_indexes),
-        static_cast<const uint64_t *>(bootstrapping_key), buffer,
-        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-        base_log, level_count, num_samples, num_many_lut, lut_stride);
-    break;
-  case PBS_VARIANT::DEFAULT:
-    host_multi_bit_programmable_bootstrap_noise_tests<uint64_t, Degree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_output_indexes),
-        static_cast<const uint64_t *>(lut_vector),
-        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(lwe_input_indexes),
-        static_cast<const uint64_t *>(bootstrapping_key), buffer,
-        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-        base_log, level_count, num_samples, num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
-  }
-}
-
 /**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -25,8 +25,7 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
         level_count;
 }

-template <typename Torus, class params, sharedMemDegree SMD,
-          bool runs_noise_test = false>
+template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -56,6 +55,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {

+    const Torus *block_lwe_array_in =
+        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
+
    double2 *keybundle = keybundle_array +
                         // select the input
                         input_idx * keybundle_size_per_input;
@@ -84,40 +86,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    // Precalculate the monomial degrees and store them in shared memory
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
    if (threadIdx.x < (1 << grouping_factor)) {
-      if constexpr (runs_noise_test == true) {
-        // For noise tests the input array contains the input lwe but also the
-        // modswitched results. This allows to avoid changing the accumulation
-        // kernel for the noise tests since the input body will stay in the same
-        // position. The layout of the input array is the following:
-        // | input lwe     | modswitched inputs       |
-        // | lwe size      | lwe_size*grouping_factor |
-
-        // This offset allows to jump directly to the modswitched inputs,
-        // skipping the input lwe
-        const Torus modswitched_offset = lwe_dimension + 1;
-
-        const Torus *block_lwe_array_in =
-            &lwe_array_in[lwe_input_indexes[input_idx] *
-                              (lwe_dimension / grouping_factor) *
-                              (1 << grouping_factor) +
-                          modswitched_offset];
-
-        const Torus *lwe_array_group =
-            block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
-        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
-
-      } else {
-        // In production we calculate the monomial degrees on the fly, since
-        // they are not stored in the input array.
-        const Torus *block_lwe_array_in =
-            &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
-
-        const Torus *lwe_array_group =
-            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-        monomial_degrees[threadIdx.x] =
-            calculates_monomial_degree<Torus, params>(
-                lwe_array_group, threadIdx.x, grouping_factor);
-      }
+      const Torus *lwe_array_group =
+          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
+          lwe_array_group, threadIdx.x, grouping_factor);
    }
    __syncthreads();

@@ -173,8 +145,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 // Then we can just calculate the offset needed to apply this coefficients, and
 // the operation transforms into a pointwise vector multiplication, avoiding to
 // perform extra instructions other than MADD
-template <typename Torus, class params, sharedMemDegree SMD,
-          bool runs_noise_test = false>
+template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -248,40 +219,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;

    if (threadIdx.x < (1 << grouping_factor)) {
-      if constexpr (runs_noise_test == true) {
-        // For noise tests the input array contains the input lwe but also the
-        // modswitched results. This allows to avoid changing the accumulation
-        // kernel for the noise tests since the input body will stay in the same
-        // position. The layout of the input array is the following:
-        // | input lwe     | modswitched inputs       |
-        // | lwe size      | lwe_size*grouping_factor |
-
-        // This offset allows to jump directly to the modswitched inputs,
-        // skipping the input lwe
-        const Torus modswitched_offset = lwe_dimension + 1;
-
-        const Torus *block_lwe_array_in =
-            &lwe_array_in[lwe_input_indexes[input_idx] *
-                              (lwe_dimension / grouping_factor) *
-                              (1 << grouping_factor) +
-                          modswitched_offset];
-
-        const Torus *lwe_array_group =
-            block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
-        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
-
-      } else {
-        // In production we calculate the monomial degrees on the fly, since
-        // they are not stored in the input array.
-        const Torus *block_lwe_array_in =
-            &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
-
-        const Torus *lwe_array_group =
-            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-        monomial_degrees[threadIdx.x] =
-            calculates_monomial_degree<Torus, params>(
-                lwe_array_group, threadIdx.x, grouping_factor);
-      }
+      const Torus *lwe_array_group =
+          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
+          lwe_array_group, threadIdx.x, grouping_factor);
    }
    __syncthreads();

@@ -721,7 +662,6 @@ enum class MultiBitKeybundleLaunchMode {
  AUTO,
  GENERIC,
  SPECIALIZED_2_2,
-  NOISE_TESTS,
 };

 template <typename Torus, class params>
@@ -786,65 +726,30 @@ __host__ void execute_compute_keybundle_with_mode(
    bool use_specialized =
        launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
        (launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
-         can_use_specialized) ||
-        (launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS &&
         can_use_specialized);
-    bool use_noise_test_template =
-        launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS;
    if (use_specialized) {
      dim3 thds_new_keybundle(512, 1, 1);
-      if (use_noise_test_template) {
-        // Set up the noise-test variant of the specialized 2_2 kernel
-        check_cuda_error(cudaFuncSetAttribute(
-            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-                Torus, Degree<2048>, FULLSM, true>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize,
-            3 * full_sm_keybundle));
-        check_cuda_error(cudaFuncSetCacheConfig(
-            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-                Torus, Degree<2048>, FULLSM, true>,
-            cudaFuncCachePreferShared));
-        check_cuda_error(cudaGetLastError());
-        device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-            Torus, Degree<2048>, FULLSM, true>
-            <<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
-               stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
-                         bootstrapping_key, lwe_dimension, lwe_offset,
-                         chunk_size, keybundle_size_per_input);
-      } else {
-        check_cuda_error(cudaFuncSetAttribute(
-            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-                Torus, Degree<2048>, FULLSM>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize,
-            3 * full_sm_keybundle));
-        check_cuda_error(cudaFuncSetCacheConfig(
-            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-                Torus, Degree<2048>, FULLSM>,
-            cudaFuncCachePreferShared));
-        check_cuda_error(cudaGetLastError());
-        device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-            Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
-                                           3 * full_sm_keybundle, stream>>>(
-            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-            lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
-      }
+      check_cuda_error(cudaFuncSetAttribute(
+          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+              Torus, Degree<2048>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+              Torus, Degree<2048>, FULLSM>,
+          cudaFuncCachePreferShared));
+      check_cuda_error(cudaGetLastError());
+      device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+          Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
+                                         3 * full_sm_keybundle, stream>>>(
+          lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+          lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
    } else {
-      if (use_noise_test_template) {
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM,
-                                                          true>
-            <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
-                lwe_array_in, lwe_input_indexes, keybundle_fft,
-                bootstrapping_key, lwe_dimension, glwe_dimension,
-                polynomial_size, grouping_factor, level_count, lwe_offset,
-                chunk_size, keybundle_size_per_input, d_mem, 0);
-      } else {
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
-            <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
-                lwe_array_in, lwe_input_indexes, keybundle_fft,
-                bootstrapping_key, lwe_dimension, glwe_dimension,
-                polynomial_size, grouping_factor, level_count, lwe_offset,
-                chunk_size, keybundle_size_per_input, d_mem, 0);
-      }
+      device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
+          <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+              lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+              lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+              level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+              d_mem, 0);
    }
  }
  check_cuda_error(cudaGetLastError());
@@ -891,20 +796,6 @@ __host__ void execute_compute_keybundle_2_2_specialized(
      grouping_factor, level_count, lwe_offset,
      MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
 }
-// Used only to run noise tests
-template <typename Torus, class params>
-__host__ void execute_compute_keybundle_noise_tests(
-    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-  execute_compute_keybundle_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-      grouping_factor, level_count, lwe_offset,
-      MultiBitKeybundleLaunchMode::NOISE_TESTS);
-}

 template <typename Torus, class params, bool is_first_iter>
 __host__ void execute_step_one(
@@ -1064,62 +955,4 @@ __host__ void host_multi_bit_programmable_bootstrap(
    }
  }
 }
-
-template <typename Torus, class params>
-__host__ void host_multi_bit_programmable_bootstrap_noise_tests(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-
-  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
-       lwe_offset += lwe_chunk_size) {
-
-    // Compute a keybundle with NOISE_TESTS mode to enable the specialized
-    // runs_noise_test=true kernel variant for noise measurement
-    execute_compute_keybundle_with_mode<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset,
-        MultiBitKeybundleLaunchMode::NOISE_TESTS);
-    // Accumulate (same as standard path)
-    uint32_t chunk_size =
-        std::min((uint32_t)lwe_chunk_size,
-                 (lwe_dimension / grouping_factor) - lwe_offset);
-    for (uint32_t j = 0; j < chunk_size; j++) {
-      bool is_first_iter = (j + lwe_offset) == 0;
-      bool is_last_iter =
-          (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
-      if (is_first_iter) {
-        execute_step_one<Torus, params, true>(
-            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-            lwe_input_indexes, buffer, num_samples, lwe_dimension,
-            glwe_dimension, polynomial_size, base_log, level_count);
-      } else {
-        execute_step_one<Torus, params, false>(
-            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-            lwe_input_indexes, buffer, num_samples, lwe_dimension,
-            glwe_dimension, polynomial_size, base_log, level_count);
-      }
-
-      if (is_last_iter) {
-        execute_step_two<Torus, params, true>(
-            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
-            num_samples, glwe_dimension, polynomial_size, level_count, j,
-            num_many_lut, lut_stride);
-      } else {
-        execute_step_two<Torus, params, false>(
-            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
-            num_samples, glwe_dimension, polynomial_size, level_count, j,
-            num_many_lut, lut_stride);
-      }
-    }
-  }
-}
 #endif // MULTIBIT_PBS_H
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -293,81 +293,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
  *buffer = nullptr;
 }

-// Noise-tests-namespaced wrappers: delegate to the standard scratch/cleanup so
-// that callers using the noise-tests PBS128 variant have a consistent API.
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-  return scratch_cuda_multi_bit_programmable_bootstrap_128_async(
-      stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
-      level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
-}
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
-  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
-                                                    pbs_buffer);
-  cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
-}
-
-// Noise tests variant of the 128-bit multi-bit PBS, restricted to
-// polynomial_size=2048. The input is assumed to contain precomputed
-// modswitched values in the extended input array layout.
-void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lwe_array_in, void const *lwe_input_indexes,
-    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  PANIC_IF_FALSE(num_samples == 1,
-                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
-                 num_samples);
-  PANIC_IF_FALSE(base_log <= 64,
-                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
-                 base_log);
-  PANIC_IF_FALSE(polynomial_size == 2048,
-                 "Cuda error (multi-bit PBS128 noise tests): only polynomial "
-                 "size 2048 is supported, got %d.",
-                 polynomial_size);
-
-  auto *buffer =
-      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
-  switch (buffer->pbs_variant) {
-  case PBS_VARIANT::CG:
-    host_cg_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
-                                                             Degree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<__uint128_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_output_indexes),
-        static_cast<const __uint128_t *>(lut_vector),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(lwe_input_indexes),
-        static_cast<const __uint128_t *>(bootstrapping_key), buffer,
-        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-        base_log, level_count, num_samples, num_many_lut, lut_stride);
-    break;
-  case PBS_VARIANT::DEFAULT:
-    host_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
-                                                          Degree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<__uint128_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_output_indexes),
-        static_cast<const __uint128_t *>(lut_vector),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(lwe_input_indexes),
-        static_cast<const __uint128_t *>(bootstrapping_key), buffer,
-        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-        base_log, level_count, num_samples, num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
-  }
-}
-
 /**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -18,8 +18,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
                                      (size_t)2); // accumulator
 }

-template <typename InputTorus, class params, sharedMemDegree SMD,
-          bool runs_noise_test = false>
+template <typename InputTorus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
    const InputTorus *__restrict__ lwe_array_in,
    const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
@@ -81,35 +80,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
    // Precalculate the monomial degrees and store them in shared memory
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
    if (threadIdx.x < (1 << grouping_factor)) {
-      if constexpr (runs_noise_test == true) {
-        // For noise tests the input array contains the input lwe but also the
-        // modswitched results. This allows to avoid changing the accumulation
-        // kernel for the noise tests since the input body will stay in the same
-        // position. The layout of the input array is the following:
-        // | input lwe     | modswitched inputs       |
-        // | lwe size      | lwe_size*grouping_factor |
-
-        // This offset allows to jump directly to the modswitched inputs,
-        // skipping the input lwe
-        const InputTorus modswitched_offset = lwe_dimension + 1;
-
-        const InputTorus *block_lwe_array_in_noise =
-            &lwe_array_in[lwe_input_indexes[input_idx] *
-                              (lwe_dimension / grouping_factor) *
-                              (1 << grouping_factor) +
-                          modswitched_offset];
-
-        const InputTorus *lwe_array_group =
-            block_lwe_array_in_noise +
-            rev_lwe_iteration * (1 << grouping_factor);
-        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
-      } else {
-        auto lwe_array_group =
-            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-        monomial_degrees[threadIdx.x] =
-            calculates_monomial_degree<InputTorus, params>(
-                lwe_array_group, threadIdx.x, grouping_factor);
-      }
+      auto lwe_array_group =
+          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+      monomial_degrees[threadIdx.x] =
+          calculates_monomial_degree<InputTorus, params>(
+              lwe_array_group, threadIdx.x, grouping_factor);
    }
    __syncthreads();

@@ -613,74 +588,6 @@ __host__ void execute_compute_keybundle_128(
  check_cuda_error(cudaGetLastError());
 }

-// Used only to run noise tests: launches the keybundle kernel with the
-// runs_noise_test=true variant, which reads modswitched inputs from the
-// extended input array layout instead of computing them on-the-fly
-template <typename InputTorus, class params>
-__host__ void execute_compute_keybundle_noise_tests_128(
-    cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
-    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-  cuda_set_device(gpu_index);
-
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-  uint64_t chunk_size = std::min(
-      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
-
-  uint64_t keybundle_size_per_input =
-      lwe_chunk_size * level_count * (glwe_dimension + 1) *
-      (glwe_dimension + 1) * (polynomial_size / 2) * 4;
-
-  uint64_t full_sm_keybundle =
-      get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
-          __uint128_t>(polynomial_size);
-  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-
-  auto d_mem = buffer->d_mem_keybundle;
-  auto keybundle_fft = buffer->keybundle_fft;
-
-  dim3 grid_keybundle(num_samples * chunk_size,
-                      (glwe_dimension + 1) * (glwe_dimension + 1), level_count);
-  dim3 thds(polynomial_size / params::opt, 1, 1);
-
-  if (max_shared_memory < full_sm_keybundle) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_keybundle_128<
-            InputTorus, params, NOSM, true>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    check_cuda_error(cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_keybundle_128<
-            InputTorus, params, NOSM, true>,
-        cudaFuncCachePreferShared));
-    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
-                                                          NOSM, true>
-        <<<grid_keybundle, thds, 0, stream>>>(
-            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
-            d_mem, full_sm_keybundle);
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_keybundle_128<
-            InputTorus, params, FULLSM, true>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
-    check_cuda_error(cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_keybundle_128<
-            InputTorus, params, FULLSM, true>,
-        cudaFuncCachePreferShared));
-    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
-                                                          FULLSM, true>
-        <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
-            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
-            d_mem, 0);
-  }
-  check_cuda_error(cudaGetLastError());
-}
-
 template <typename InputTorus, class params, bool is_first_iter>
 __host__ void execute_step_one_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
@@ -1212,47 +1119,46 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
  int max_active_blocks_per_sm;

  if (max_shared_memory < partial_sm_cg_accumulate) {
-    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, NOSM>,
-        thds, 0));
+        thds, 0);
  } else if (max_shared_memory < full_sm_cg_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
-    check_cuda_error(cudaFuncSetCacheConfig(
+    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  PARTIALSM>,
-        cudaFuncCachePreferShared));
-    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        cudaFuncCachePreferShared);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, PARTIALSM>,
-        thds, partial_sm_cg_accumulate));
+        thds, partial_sm_cg_accumulate);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
-    check_cuda_error(cudaFuncSetCacheConfig(
+    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
                                                                  FULLSM>,
-        cudaFuncCachePreferShared));
-    check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        cudaFuncCachePreferShared);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            Torus, params, FULLSM>,
-        thds, full_sm_cg_accumulate));
+        thds, full_sm_cg_accumulate);
    check_cuda_error(cudaGetLastError());
  }

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  check_cuda_error(
-      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

@@ -1293,96 +1199,4 @@ supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
  }
 }

-// Noise tests variant: identical to
-// host_cg_multi_bit_programmable_bootstrap_128 but uses the noise-test
-// keybundle (runs_noise_test=true) instead of the standard one.
-template <typename InputTorus, class params>
-__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests_128(
-    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
-    __uint128_t const *bootstrapping_key,
-    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-
-  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
-       lwe_offset += lwe_chunk_size) {
-
-    // Compute a keybundle with the noise-test kernel variant
-    // (runs_noise_test=true) to read precomputed modswitched values
-    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
-
-    execute_cg_external_product_loop_128<InputTorus, params>(
-        stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
-        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
-        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        lwe_offset, num_many_lut, lut_stride);
-  }
-}
-
-template <typename InputTorus, class params>
-__host__ void host_multi_bit_programmable_bootstrap_noise_tests_128(
-    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
-    __uint128_t const *bootstrapping_key,
-    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-
-  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
-       lwe_offset += lwe_chunk_size) {
-
-    // Compute a keybundle with the noise-test kernel variant
-    // (runs_noise_test=true) to read precomputed modswitched values
-    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
-
-    // Accumulate (same as standard path)
-    uint64_t chunk_size =
-        std::min((uint32_t)lwe_chunk_size,
-                 (lwe_dimension / grouping_factor) - lwe_offset);
-    for (uint32_t j = 0; j < chunk_size; j++) {
-      bool is_first_iter = (j + lwe_offset) == 0;
-      bool is_last_iter =
-          (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
-      if (is_first_iter) {
-        execute_step_one_128<InputTorus, params, true>(
-            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
-            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-            base_log, level_count);
-      } else {
-        execute_step_one_128<InputTorus, params, false>(
-            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
-            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-            base_log, level_count);
-      }
-
-      if (is_last_iter) {
-        execute_step_two_128<InputTorus, params, true>(
-            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
-            num_samples, glwe_dimension, polynomial_size, level_count, j,
-            num_many_lut, lut_stride);
-      } else {
-        execute_step_two_128<InputTorus, params, false>(
-            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
-            num_samples, glwe_dimension, polynomial_size, level_count, j,
-            num_many_lut, lut_stride);
-      }
-    }
-  }
-}
-
 #endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -739,8 +739,7 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
-  check_cuda_error(
-      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -795,40 +795,6 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
      MultiBitTbcLaunchMode::SPECIALIZED_2_2);
 }

-// Noise tests variant: uses NOISE_TESTS keybundle mode for the keybundle step
-// while keeping the standard AUTO accumulate behaviour for the TBC loop.
-template <typename Torus, class params>
-__host__ void host_tbc_multi_bit_programmable_bootstrap_noise_tests(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  cuda_set_device(gpu_index);
-
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
-       lwe_offset += lwe_chunk_size) {
-
-    // Keybundle with NOISE_TESTS mode; the TBC accumulate uses AUTO as usual
-    execute_compute_keybundle_noise_tests<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset);
-
-    // Accumulate (unchanged from standard TBC path)
-    execute_tbc_external_product_loop<Torus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride, MultiBitTbcLaunchMode::AUTO);
-  }
-}
-
 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory) {
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -119,73 +119,71 @@ __host__ void host_expand_without_verification(
      streams.stream(0), streams.gpu_index(0), true);

  if (mem_ptr->expand_kind == EXPAND_KIND::NO_CASTING) {
-    // This path is added to mimic the CPU fallback behaviour for the no_casting
-    // expand, which is needed for the noise sanity checks.
    host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
                                   lwe_array_out, d_expand_jobs, num_lwes);
-
-  } else {
-    // This is our default path for the expand with casting if needed.
-    host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
-                                   expanded_lwes, d_expand_jobs, num_lwes);
-
-    auto lwe_array_input = expanded_lwes;
-    auto ksks = casting_keys;
-    auto message_and_carry_extract_luts =
-        mem_ptr->message_and_carry_extract_luts;
-
-    auto lut = mem_ptr->message_and_carry_extract_luts;
-    if (casting_key_type == SMALL_TO_BIG) {
-      if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
-        PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
-      }
-      // Keyswitch from small to big key if needed
-      auto ksed_small_to_big_expanded_lwes =
-          mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
-      std::vector<Torus *> lwe_trivial_indexes_vec =
-          lut->lwe_trivial_indexes_vec;
-
-      auto casting_params = mem_ptr->casting_params;
-      auto casting_output_dimension = casting_params.big_lwe_dimension;
-      auto casting_input_dimension = casting_params.small_lwe_dimension;
-      auto casting_ks_level = casting_params.ks_level;
-      auto casting_ks_base_log = casting_params.ks_base_log;
-
-      // apply keyswitch to BIG
-      execute_keyswitch_async<Torus>(
-          streams.get_ith(0), ksed_small_to_big_expanded_lwes,
-          lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
-          casting_keys, casting_input_dimension, casting_output_dimension,
-          casting_ks_base_log, casting_ks_level, num_lwes,
-          lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
-
-      // In this case, the next keyswitch will use the compute ksk
-      ksks = compute_ksks;
-      lwe_array_input = ksed_small_to_big_expanded_lwes;
-    }
-
-    // Apply LUT
-    cuda_memset_async(lwe_array_out, 0,
-                      safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
-                                             (size_t)num_lwes, (size_t)2),
-                      streams.stream(0), streams.gpu_index(0));
-    CudaRadixCiphertextFFI output;
-    into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
-    CudaRadixCiphertextFFI input;
-    into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
-    // This is a special case only for our noise sanity checks
-    // If we are doing a SANITY_CHECK expand, we just apply the identity LUT
-    // This replicates the CPU fallback behaviour of the casting expand
-    auto final_lut = (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK
-                          ? mem_ptr->identity_lut
-                          : message_and_carry_extract_luts);
-
-    integer_radix_apply_univariate_lookup_table<Torus>(
-        streams, &output, &input, bsks, ksks, final_lut, 2 * num_lwes);
-
-    release_cpu_radix_ciphertext_async(&input);
-    release_cpu_radix_ciphertext_async(&output);
+    compact_lwe_lists.release(); // the fall-through path releases it too
+    return;
  }
+  host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
+                                 expanded_lwes, d_expand_jobs, num_lwes);
+
+  auto lwe_array_input = expanded_lwes;
+  auto ksks = casting_keys;
+  auto message_and_carry_extract_luts = mem_ptr->message_and_carry_extract_luts;
+
+  auto lut = mem_ptr->message_and_carry_extract_luts;
+  if (casting_key_type == SMALL_TO_BIG) {
+    if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
+      PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
+    }
+    // Keyswitch from small to big key if needed
+    auto ksed_small_to_big_expanded_lwes =
+        mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    auto casting_params = mem_ptr->casting_params;
+    auto casting_output_dimension = casting_params.big_lwe_dimension;
+    auto casting_input_dimension = casting_params.small_lwe_dimension;
+    auto casting_ks_level = casting_params.ks_level;
+    auto casting_ks_base_log = casting_params.ks_base_log;
+
+    // apply keyswitch to BIG
+    execute_keyswitch_async<Torus>(
+        streams.get_ith(0), ksed_small_to_big_expanded_lwes,
+        lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
+        casting_keys, casting_input_dimension, casting_output_dimension,
+        casting_ks_base_log, casting_ks_level, num_lwes,
+        lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
+
+    // In this case, the next keyswitch will use the compute ksk
+    ksks = compute_ksks;
+    lwe_array_input = ksed_small_to_big_expanded_lwes;
+  }
+
+  // Apply LUT
+  cuda_memset_async(lwe_array_out, 0,
+                    safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
+                                           (size_t)num_lwes, (size_t)2),
+                    streams.stream(0), streams.gpu_index(0));
+  CudaRadixCiphertextFFI output;
+  into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
+  CudaRadixCiphertextFFI input;
+  into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
+  // This is a special case only for our noise sanity checks
+  // If we are doing a SANITY_CHECK expand, we just apply the identity LUT
+  // This replicates the CPU fallback behaviour of the casting expand.
+  // The LUT is selected up front rather than returning early from the
+  // SANITY_CHECK branch, so that both variants reach the release calls
+  // below instead of skipping them.
+  auto final_lut = (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK
+                        ? mem_ptr->identity_lut
+                        : message_and_carry_extract_luts);
+
+  integer_radix_apply_univariate_lookup_table<Torus>(
+      streams, &output, &input, bsks, ksks, final_lut, 2 * num_lwes);
+
+  release_cpu_radix_ciphertext_async(&input);
+  release_cpu_radix_ciphertext_async(&output);
  compact_lwe_lists.release();
 }

--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -79,30 +79,6 @@ unsafe extern "C" {
        polynomial_size: u32,
    );
 }
-unsafe extern "C" {
-    pub fn cuda_modulus_switch_multi_bit_64_async(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_array_in: *mut ffi::c_void,
-        size: u32,
-        log_modulus: u32,
-        degree: u32,
-        grouping_factor: u32,
-    );
-}
-unsafe extern "C" {
-    pub fn cuda_modulus_switch_multi_bit_128_async(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_array_in: *mut ffi::c_void,
-        size: u32,
-        log_modulus: u32,
-        degree: u32,
-        grouping_factor: u32,
-    );
-}
 pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
 pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
 pub type PBS_TYPE = ffi::c_uint;
@@ -136,6 +112,9 @@ pub type Direction = ffi::c_uint;
 pub const BitValue_Zero: BitValue = 0;
 pub const BitValue_One: BitValue = 1;
 pub type BitValue = ffi::c_uint;
+pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
+pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
+pub type RERAND_MODE = ffi::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct CudaStreamsFFI {
@@ -2476,9 +2455,6 @@ unsafe extern "C" {
        glwe_index: u32,
    );
 }
-pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
-pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
-pub type RERAND_MODE = ffi::c_uint;
 unsafe extern "C" {
    pub fn scratch_cuda_rerand_64_async(
        streams: CudaStreamsFFI,
@@ -2491,7 +2467,7 @@ unsafe extern "C" {
        message_modulus: u32,
        carry_modulus: u32,
        allocate_gpu_memory: bool,
-        rerand_type: RERAND_MODE,
+        rerand_type: u32,
    ) -> u64;
 }
 unsafe extern "C" {
@@ -3391,48 +3367,6 @@ unsafe extern "C" {
        pbs_buffer: *mut *mut i8,
    );
 }
-unsafe extern "C" {
-    pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        level_count: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    ) -> u64;
-}
-unsafe extern "C" {
-    pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-    );
-}
-unsafe extern "C" {
-    pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_output_indexes: *const ffi::c_void,
-        lut_vector: *const ffi::c_void,
-        lut_vector_indexes: *const ffi::c_void,
-        lwe_array_in: *const ffi::c_void,
-        lwe_input_indexes: *const ffi::c_void,
-        bootstrapping_key: *const ffi::c_void,
-        buffer: *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        grouping_factor: u32,
-        base_log: u32,
-        level_count: u32,
-        num_samples: u32,
-        num_many_lut: u32,
-        lut_stride: u32,
-    );
-}
 unsafe extern "C" {
    pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_async(
        stream: *mut ffi::c_void,
@@ -3474,44 +3408,3 @@ unsafe extern "C" {
        buffer: *mut *mut i8,
    );
 }
-unsafe extern "C" {
-    pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        level_count: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    ) -> u64;
-}
-unsafe extern "C" {
-    pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-    );
-}
-unsafe extern "C" {
-    pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
-        stream: *mut ffi::c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_output_indexes: *const ffi::c_void,
-        lut_vector: *const ffi::c_void,
-        lwe_array_in: *const ffi::c_void,
-        lwe_input_indexes: *const ffi::c_void,
-        bootstrapping_key: *const ffi::c_void,
-        buffer: *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        grouping_factor: u32,
-        base_log: u32,
-        level_count: u32,
-        num_samples: u32,
-        num_many_lut: u32,
-        lut_stride: u32,
-    );
-}
--- a/backends/tfhe-hpu-backend/Cargo.toml
+++ b/backends/tfhe-hpu-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-hpu-backend"
-version = "0.5.0"
+version = "0.4.0"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "HPU implementation on FPGA of TFHE-rs primitives."
@@ -36,7 +36,7 @@ thiserror = "1.0.61"
 bytemuck = { workspace = true }
 anyhow = "1.0.82"
 lazy_static = "1.4.0"
-rand = "0.10.1"
+rand = "0.8.5"
 regex = "1.10.4"
 bitflags = { version = "2.5.0", features = ["serde"] }
 itertools = "0.11.0"
--- a/backends/tfhe-hpu-backend/LICENSE
+++ b/backends/tfhe-hpu-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2026 ZAMA.
+Copyright © 2025 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-hpu-backend/README.md
+++ b/backends/tfhe-hpu-backend/README.md
@@ -297,8 +297,8 @@ source setup_hpu.sh --config v80 -p
 # Run hlapi benches
 make test_high_level_api_hpu

-# Run hlapi erc7984 benches
-make bench_hlapi_erc7984_hpu 
+# Run hlapi erc20 benches
+make bench_hlapi_erc20_hpu 

 # Run integer level benches
 make bench_integer_hpu
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
@@ -109,7 +109,7 @@
  flush_behaviour = "Patient"
  flush = true

-[firmware.op_cfg.by_op.ERC_7984]
+[firmware.op_cfg.by_op.ERC_20]
  fill_batch_fifo = true
  min_batch_size = false
  use_tiers = true
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
@@ -121,7 +121,7 @@
  flush_behaviour = "Patient"
  flush = true

-[firmware.op_cfg.by_op.ERC_7984]
+[firmware.op_cfg.by_op.ERC_20]
  fill_batch_fifo = true
  min_batch_size = false
  use_tiers = true
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -230,7 +230,7 @@ iop!(
    [IOP_CMP -> "CMP_NEQ", opcode::CMP_NEQ],
    [IOP_CT_F_CT_BOOL -> "IF_THEN_ZERO", opcode::IF_THEN_ZERO],
    [IOP_CT_F_2CT_BOOL -> "IF_THEN_ELSE", opcode::IF_THEN_ELSE],
-    [IOP_2CT_F_3CT -> "ERC_7984", opcode::ERC_7984],
+    [IOP_2CT_F_3CT -> "ERC_20", opcode::ERC_20],
    [IOP_CT_F_CT -> "MEMCPY", opcode::MEMCPY],
    [IOP_CT_F_CT -> "ILOG2", opcode::ILOG2],
    [IOP_CT_F_CT -> "COUNT0", opcode::COUNT0],
@@ -240,5 +240,5 @@ iop!(
    [IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
    [IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
    [IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
-    [IOP_2NCT_F_3NCT -> "ERC_7984_SIMD", opcode::ERC_7984_SIMD],
+    [IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
 );
--- a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
@@ -74,9 +74,9 @@ pub const IF_THEN_ZERO: u8 = 0xCA;
 pub const IF_THEN_ELSE: u8 = 0xCB;

 // Custom algorithm
-// ERC7984 -> Found xfer algorithm
+// ERC20 -> Fund xfer algorithm
 // 2Ct <- func(3Ct)
-pub const ERC_7984: u8 = 0x80;
+pub const ERC_20: u8 = 0x80;

 // Count bits
 pub const COUNT0: u8 = 0x81;
@@ -89,7 +89,7 @@ pub const TRAIL1: u8 = 0x87;

 // SIMD for maximum throughput
 pub const ADD_SIMD: u8 = 0xF0;
-pub const ERC_7984_SIMD: u8 = 0xF1;
+pub const ERC_20_SIMD: u8 = 0xF1;
 //
 // Utility operations
 // Used to handle real clone of ciphertext already uploaded in the Hpu memory
--- a/backends/tfhe-hpu-backend/src/ffi/v80/mod.rs
+++ b/backends/tfhe-hpu-backend/src/ffi/v80/mod.rs
@@ -24,7 +24,7 @@ use mem_alloc::{MemAlloc, MemChunk};

 mod qdma;
 use qdma::QdmaDriver;
-use rand::RngExt;
+use rand::Rng;

 const DMA_XFER_ALIGN: usize = 4096_usize;

@@ -148,8 +148,8 @@ impl HpuHw {
        tracing::debug!("Load stage1 through JTAG");
        let pdi_stg1_tmp = format!(
            "hpu_stg1_{}.pdi",
-            rand::rng()
-                .sample_iter(rand::distr::Alphanumeric)
+            rand::thread_rng()
+                .sample_iter(rand::distributions::Alphanumeric)
                .take(5)
                .map(char::from)
                .collect::<String>()
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs
@@ -31,7 +31,7 @@ crate::impl_fw!("Demo" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_7984 => fw_impl::ilp::iop_erc_7984;
+    ERC_20 => fw_impl::ilp::iop_erc_20;

    CMP_GT  => cmp_gt;
    CMP_GTE => cmp_gte;
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
@@ -61,7 +61,7 @@ crate::impl_fw!("Ilp" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_7984 => fw_impl::ilp::iop_erc_7984;
+    ERC_20 => fw_impl::ilp::iop_erc_20;

    MEMCPY => fw_impl::ilp::iop_memcpy;

@@ -74,7 +74,7 @@ crate::impl_fw!("Ilp" [
    TRAIL1 => fw_impl::ilp_log::iop_trail1;
    // SIMD Implementations
    ADD_SIMD     => fw_impl::llt::iop_add_simd;
-    ERC_7984_SIMD  => fw_impl::llt::iop_erc_7984_simd;
+    ERC_20_SIMD  => fw_impl::llt::iop_erc_20_simd;
 ]);

 #[instrument(level = "trace", skip(prog))]
@@ -1296,13 +1296,13 @@ pub fn iop_if_then_else(prog: &mut Program) {
        });
 }

-/// Implement erc_7984 fund xfer
+/// Implement erc_20 fund xfer
 /// Targeted algorithm is as follow:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
 /// 3. Compute new amount (from - new_amount, to + new_amount)
 #[instrument(level = "info", skip(prog))]
-pub fn iop_erc_7984(prog: &mut Program) {
+pub fn iop_erc_20(prog: &mut Program) {
    // Allocate metavariables:
    // Dest -> Operand
    let mut dst_from = prog.iop_template_var(OperandKind::Dst, 0);
@@ -1314,7 +1314,7 @@ pub fn iop_erc_7984(prog: &mut Program) {
    let src_amount = prog.iop_template_var(OperandKind::Src, 2);

    // Add Comment header
-    prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());

    let props = prog.params();
    let tfhe_params: asm::DigitParameters = props.clone().into();
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
@@ -70,7 +70,7 @@ crate::impl_fw!("Llt" [
    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

-    ERC_7984 => fw_impl::llt::iop_erc_7984;
+    ERC_20 => fw_impl::llt::iop_erc_20;
    MEMCPY => fw_impl::ilp::iop_memcpy;

    COUNT0 => fw_impl::ilp_log::iop_count0;
@@ -83,7 +83,7 @@ crate::impl_fw!("Llt" [

    // SIMD Implementations
    ADD_SIMD     => fw_impl::llt::iop_add_simd;
-    ERC_7984_SIMD  => fw_impl::llt::iop_erc_7984_simd;
+    ERC_20_SIMD  => fw_impl::llt::iop_erc_20_simd;
 ]);

 // ----------------------------------------------------------------------------
@@ -225,24 +225,24 @@ pub fn iop_muls(prog: &mut Program) {
 }

 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_7984(prog: &mut Program) {
+pub fn iop_erc_20(prog: &mut Program) {
    // Add Comment header
-    prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
    // TODO: Make sweep of kogge_blk_w
    // All these little parameters would be very handy to write an
    // exploration/compilation program which would try to minimize latency by
    // playing with these.
-    iop_erc_7984_rtl(prog, 0, Some(10)).add_to_prog(prog);
+    iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
 }

 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_7984_simd(prog: &mut Program) {
+pub fn iop_erc_20_simd(prog: &mut Program) {
    // Add Comment header
-    prog.push_comment("ERC_7984_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
+    prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
    simd(
        prog,
        crate::asm::iop::SIMD_N,
-        fw_impl::llt::iop_erc_7984_rtl,
+        fw_impl::llt::iop_erc_20_rtl,
        None,
    );
 }
@@ -379,7 +379,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
 // Helper Functions
 // ----------------------------------------------------------------------------

-/// Implement erc_7984 fund xfer
+/// Implement erc_20 fund xfer
 /// Targeted algorithm is as follows:
 /// 1. Check that from has enough funds
 /// 2. Compute real_amount to xfer (i.e. amount or 0)
@@ -391,7 +391,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
 ///     (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
 /// Where N is the batch size
 #[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_7984_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
+pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
    // Allocate metavariables:
    // Dest -> Operand
    let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
--- a/backends/zk-cuda-backend/Cargo.toml
+++ b/backends/zk-cuda-backend/Cargo.toml
@@ -24,7 +24,7 @@ bindgen.workspace = true
 [dependencies]
 ark-ec.workspace = true
 ark-ff.workspace = true
-tfhe-cuda-backend = { version = "0.14.0", path = "../tfhe-cuda-backend" }
+tfhe-cuda-backend = { version = "=0.14.0", path = "../tfhe-cuda-backend" }

 [features]
 default = []
--- a/backends/zk-cuda-backend/cuda/include/msm.h
+++ b/backends/zk-cuda-backend/cuda/include/msm.h
@@ -97,23 +97,28 @@ size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index);
 //   d_scalars: Device pointer to input BigInt scalars (array of n scalars)
 //   n: Number of points/scalars
 //   d_scratch: Caller-provided device scratch buffer for intermediate results
+//   size_tracker: Reference for tracking GPU memory allocation sizes
 void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
                        G1Projective *h_result, const G1Affine *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G1Projective *d_scratch);
+                        G1Projective *d_scratch, uint64_t &size_tracker,
+                        bool gpu_memory_allocated);

 void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
                  G1Projective *h_result, const G1Affine *d_points,
-                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch);
+                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
+                  uint64_t &size_tracker, bool gpu_memory_allocated);

 // MSM for G2 points with BigInt scalars (projective result)
 // Result is written directly to a host pointer.
 void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
                        G2ProjectivePoint *h_result, const G2Point *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G2ProjectivePoint *d_scratch);
+                        G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
+                        bool gpu_memory_allocated);

 void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
                  G2ProjectivePoint *h_result, const G2Point *d_points,
                  const Scalar *d_scalars, uint32_t n,
-                  G2ProjectivePoint *d_scratch);
+                  G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
+                  bool gpu_memory_allocated);
--- a/backends/zk-cuda-backend/cuda/src/msm/msm.cu
+++ b/backends/zk-cuda-backend/cuda/src/msm/msm.cu
@@ -8,16 +8,17 @@
 // Multi-Scalar Multiplication (MSM) using Pippenger algorithm for BLS12-446

 // Forward declarations for Pippenger implementations
-void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
-                                  G1Projective *h_result,
-                                  const G1Affine *d_points,
-                                  const Scalar *d_scalars, uint32_t n,
-                                  G1Projective *d_scratch);
+void point_msm_g1_pippenger_async(
+    cudaStream_t stream, uint32_t gpu_index, G1Projective *h_result,
+    const G1Affine *d_points, const Scalar *d_scalars, uint32_t n,
+    G1Projective *d_scratch, uint64_t &size_tracker, bool gpu_memory_allocated);
 void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
                                  G2ProjectivePoint *h_result,
                                  const G2Point *d_points,
                                  const Scalar *d_scalars, uint32_t n,
-                                  G2ProjectivePoint *d_scratch);
+                                  G2ProjectivePoint *d_scratch,
+                                  uint64_t &size_tracker,
+                                  bool gpu_memory_allocated);

 // ============================================================================
 // Public MSM API for BigInt scalars
@@ -28,9 +29,11 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
 void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
                        G1Projective *h_result, const G1Affine *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G1Projective *d_scratch) {
+                        G1Projective *d_scratch, uint64_t &size_tracker,
+                        bool gpu_memory_allocated) {
  point_msm_g1_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
-                               n, d_scratch);
+                               n, d_scratch, size_tracker,
+                               gpu_memory_allocated);
 }

 // MSM with BigInt scalars for G2 (projective coordinates internally)
@@ -38,17 +41,19 @@ void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
 void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
                        G2ProjectivePoint *h_result, const G2Point *d_points,
                        const Scalar *d_scalars, uint32_t n,
-                        G2ProjectivePoint *d_scratch) {
+                        G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
+                        bool gpu_memory_allocated) {
  point_msm_g2_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
-                               n, d_scratch);
+                               n, d_scratch, size_tracker,
+                               gpu_memory_allocated);
 }

 void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
                  G1Projective *h_result, const G1Affine *d_points,
-                  const Scalar *d_scalars, uint32_t n,
-                  G1Projective *d_scratch) {
+                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
+                  uint64_t &size_tracker, bool gpu_memory_allocated) {
  point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                     d_scratch);
+                     d_scratch, size_tracker, gpu_memory_allocated);
  // The async impl already syncs internally before the CPU-side Horner phase,
  // so the stream is idle here. This sync is kept for defensive correctness.
  cuda_synchronize_stream(stream, gpu_index);
@@ -57,9 +62,10 @@ void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
 void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
                  G2ProjectivePoint *h_result, const G2Point *d_points,
                  const Scalar *d_scalars, uint32_t n,
-                  G2ProjectivePoint *d_scratch) {
+                  G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
+                  bool gpu_memory_allocated) {
  point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                     d_scratch);
+                     d_scratch, size_tracker, gpu_memory_allocated);
  // See comment in point_msm_g1 above.
  cuda_synchronize_stream(stream, gpu_index);
 }
--- a/backends/zk-cuda-backend/cuda/src/msm/pippenger/msm_pippenger.cu
+++ b/backends/zk-cuda-backend/cuda/src/msm/pippenger/msm_pippenger.cu
@@ -493,13 +493,12 @@ void horner_combine_cpu(ProjectiveType &result,
 // window sums. The caller is responsible for allocating and freeing this
 // buffer.
 template <typename AffineType, typename ProjectiveType>
-void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
-                                    ProjectiveType *h_result,
-                                    const AffineType *d_points,
-                                    const Scalar *d_scalars, uint32_t n,
-                                    uint32_t threads_per_block,
-                                    uint32_t window_size, uint32_t bucket_count,
-                                    ProjectiveType *d_scratch) {
+void point_msm_pippenger_impl_async(
+    cudaStream_t stream, uint32_t gpu_index, ProjectiveType *h_result,
+    const AffineType *d_points, const Scalar *d_scalars, uint32_t n,
+    uint32_t threads_per_block, uint32_t window_size, uint32_t bucket_count,
+    ProjectiveType *d_scratch, uint64_t &size_tracker,
+    bool gpu_memory_allocated) {
  using ProjectivePoint = Projective<ProjectiveType>;

  if (n == 0) {
@@ -706,13 +705,16 @@ void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
                                  G1Projective *h_result,
                                  const G1Affine *d_points,
                                  const Scalar *d_scalars, uint32_t n,
-                                  G1Projective *d_scratch) {
+                                  G1Projective *d_scratch,
+                                  uint64_t &size_tracker,
+                                  bool gpu_memory_allocated) {
  uint32_t window_size, bucket_count;
  get_g1_window_params(n, window_size, bucket_count);

  point_msm_pippenger_impl_async<G1Affine, G1Projective>(
      stream, gpu_index, h_result, d_points, d_scalars, n,
-      msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch);
+      msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch,
+      size_tracker, gpu_memory_allocated);
 }

 // MSM with BigInt scalars for G2 (projective coordinates internally)
@@ -722,11 +724,14 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
                                  G2ProjectivePoint *h_result,
                                  const G2Point *d_points,
                                  const Scalar *d_scalars, uint32_t n,
-                                  G2ProjectivePoint *d_scratch) {
+                                  G2ProjectivePoint *d_scratch,
+                                  uint64_t &size_tracker,
+                                  bool gpu_memory_allocated) {
  uint32_t window_size, bucket_count;
  get_g2_window_params(n, window_size, bucket_count);

  point_msm_pippenger_impl_async<G2Point, G2ProjectivePoint>(
      stream, gpu_index, h_result, d_points, d_scalars, n,
-      msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch);
+      msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch,
+      size_tracker, gpu_memory_allocated);
 }
--- a/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
+++ b/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
@@ -187,82 +187,37 @@ __host__ __device__ void fp_copy(Fp &dst, const Fp &src) {
 // "Raw" means without modular reduction - performs a + b and returns carry.
 // This is an internal helper used by fp_add() which handles reduction.
 __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
-#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
-  // PTX carry-chain: add.cc sets the hardware carry flag, addc.cc propagates
-  // it. This replaces 2 software carry-detect comparisons per limb (~14 extra
-  // instructions across 7 limbs) with zero-cost hardware flag propagation.
-  uint64_t carry_out;
-  asm("add.cc.u64   %0,  %8,  %15;\n\t" // c[0] = a[0] + b[0], set CF
-      "addc.cc.u64  %1,  %9,  %16;\n\t" // c[1] = a[1] + b[1] + CF
-      "addc.cc.u64  %2,  %10, %17;\n\t" // c[2] = a[2] + b[2] + CF
-      "addc.cc.u64  %3,  %11, %18;\n\t" // c[3] = a[3] + b[3] + CF
-      "addc.cc.u64  %4,  %12, %19;\n\t" // c[4] = a[4] + b[4] + CF
-      "addc.cc.u64  %5,  %13, %20;\n\t" // c[5] = a[5] + b[5] + CF
-      "addc.cc.u64  %6,  %14, %21;\n\t" // c[6] = a[6] + b[6] + CF
-      "addc.u64     %7,  0,   0;\n\t"   // carry_out = 0 + 0 + CF
-      : "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
-        "=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(carry_out)
-      : "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
-        "l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
-        "l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
-        "l"(b.limb[5]), "l"(b.limb[6]));
-  return carry_out;
-#else
-  // Host path: portable software carry detection
  UNSIGNED_LIMB carry = 0;

  for (int i = 0; i < FP_LIMBS; i++) {
+    // Add with carry: c = a + b + carry
    UNSIGNED_LIMB sum = a.limb[i] + carry;
-    carry = (sum < a.limb[i]) ? 1 : 0;
+    carry = (sum < a.limb[i]) ? 1 : 0; // Check for overflow
    sum += b.limb[i];
-    carry += (sum < b.limb[i]) ? 1 : 0;
+    carry += (sum < b.limb[i]) ? 1 : 0; // Check for overflow
    c.limb[i] = sum;
  }

  return carry;
-#endif
 }

 // Subtraction with borrow propagation
 // "Raw" means without modular reduction - performs a - b and returns borrow.
 // This is an internal helper used by fp_sub() which handles reduction.
 __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
-#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
-  // PTX borrow-chain: sub.cc sets the hardware borrow flag, subc.cc propagates
-  // it. Same benefit as fp_add_raw -- eliminates 2 comparisons per limb.
-  uint64_t borrow_out;
-  asm("sub.cc.u64   %0,  %8,  %15;\n\t" // c[0] = a[0] - b[0], set CF
-      "subc.cc.u64  %1,  %9,  %16;\n\t" // c[1] = a[1] - b[1] - CF
-      "subc.cc.u64  %2,  %10, %17;\n\t" // c[2] = a[2] - b[2] - CF
-      "subc.cc.u64  %3,  %11, %18;\n\t" // c[3] = a[3] - b[3] - CF
-      "subc.cc.u64  %4,  %12, %19;\n\t" // c[4] = a[4] - b[4] - CF
-      "subc.cc.u64  %5,  %13, %20;\n\t" // c[5] = a[5] - b[5] - CF
-      "subc.cc.u64  %6,  %14, %21;\n\t" // c[6] = a[6] - b[6] - CF
-      "subc.u64     %7,  0,   0;\n\t"   // borrow_out = 0 - 0 - CF
-      : "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
-        "=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(borrow_out)
-      : "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
-        "l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
-        "l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
-        "l"(b.limb[5]), "l"(b.limb[6]));
-  // subc.u64 with 0-0-CF produces 0 if no borrow, or 0xFFFFFFFFFFFFFFFF if
-  // borrow. Normalize to 0/1 for callers that check (borrow != 0) or add it.
-  return borrow_out & 1;
-#else
-  // Host path: portable software borrow detection
  UNSIGNED_LIMB borrow = 0;

  for (int i = 0; i < FP_LIMBS; i++) {
+    // Subtract with borrow: c = a - b - borrow
    UNSIGNED_LIMB diff = a.limb[i] - borrow;
-    borrow = (diff > a.limb[i]) ? 1 : 0;
+    borrow = (diff > a.limb[i]) ? 1 : 0; // Check for underflow
    UNSIGNED_LIMB old_diff = diff;
    diff -= b.limb[i];
-    borrow += (diff > old_diff) ? 1 : 0;
+    borrow += (diff > old_diff) ? 1 : 0; // Check for underflow
    c.limb[i] = diff;
  }

  return borrow;
-#endif
 }

 // Addition with modular reduction: c = (a + b) mod p
@@ -271,27 +226,7 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
  Fp sum;
  UNSIGNED_LIMB carry = fp_add_raw(sum, a, b);

-#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
-  // Branchless reduction: always compute sum - p, then select based on
-  // whether reduction was needed. This avoids divergent branches that stall
-  // warps when some threads need reduction and others don't.
-  //
-  // Decision logic:
-  //   carry=1 -> sum overflowed 448 bits, definitely >= p -> use reduced
-  //   carry=0, borrow=0 -> sum >= p in 448 bits -> use reduced
-  //   carry=0, borrow=1 -> sum < p -> use original sum
-  // So: use_original = (!carry) & borrow
-  Fp reduced;
-  UNSIGNED_LIMB borrow = fp_sub_raw(reduced, sum, fp_modulus());
-  UNSIGNED_LIMB use_original = ((carry ^ 1) & borrow);
-  UNSIGNED_LIMB mask =
-      -use_original; // all-ones if keep sum, all-zeros if keep reduced
-
-  for (int i = 0; i < FP_LIMBS; i++) {
-    c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
-  }
-#else
-  // Host path: branching is fine on CPU (branch predictor handles it well)
+  // If there's a carry or sum >= MODULUS, we need to reduce
  const Fp &p = fp_modulus();
  if (carry || fp_cmp(sum, p) != ComparisonType::Less) {
    Fp reduced;
@@ -300,7 +235,6 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
  } else {
    fp_copy(c, sum);
  }
-#endif
 }

 // Subtraction with modular reduction: c = (a - b) mod p
@@ -309,28 +243,13 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
  Fp diff;
  UNSIGNED_LIMB borrow = fp_sub_raw(diff, a, b);

-#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
-  // Branchless correction: always compute diff + p, select based on borrow.
-  // Same rationale as fp_add -- avoids warp divergence.
-  //   borrow=1 -> a < b, need to add p -> use corrected
-  //   borrow=0 -> a >= b, result is valid -> use diff
-  Fp corrected;
-  fp_add_raw(corrected, diff, fp_modulus());
-  UNSIGNED_LIMB mask =
-      -borrow; // all-ones if borrow (use corrected), all-zeros if not
-
-  for (int i = 0; i < FP_LIMBS; i++) {
-    c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
-  }
-#else
-  // Host path: branching is fine on CPU
+  // If there was a borrow, we need to add MODULUS
  const Fp &p = fp_modulus();
  if (borrow) {
    fp_add_raw(c, diff, p);
  } else {
    fp_copy(c, diff);
  }
-#endif
 }

 // Small-constant multiplication via addition chains.
@@ -534,223 +453,23 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
  }
 }

-// ============================================================================
-// PTX-accelerated CIOS Montgomery multiplication (device path)
-// ============================================================================
-// The CIOS algorithm for 7 x 64-bit limbs executes 98 multiply-accumulate
-// steps across 7 outer iterations. Each step computes:
-//   (carry, t[j]) = t[j] + a[j] * b_i + carry
-// which is a 64x64->128 multiply plus a three-operand addition with carry.
-//
-// The C++ path uses software carry detection: carry = (sum < old) ? 1 : 0.
-// The PTX path below uses hardware carry flags via the .cc suffix:
-//   - mul.lo.u64 / mul.hi.u64 : 64x64->128 wide multiply
-//   - add.cc.u64 / addc.u64   : addition chain with hardware carry flag
-//
-// Each multiply-accumulate step uses 6 PTX instructions instead of ~10+ in
-// the software-carry version. The 7 outer iterations are fully unrolled, and
-// the limb-shift loop (t[j] = t[j+1]) is eliminated by register renaming.
-//
-// REGISTER ALIASING NOTE: All PTX temporaries (_lo, _hi) are declared as
-// .reg inside the asm block. This prevents nvcc's register allocator from
-// aliasing them with C operands (t_j, carry), which was the root cause of
-// previous correctness bugs where "+l" outputs could share registers with
-// "l" inputs in the same asm statement.
-// ============================================================================
-
-#ifdef __CUDA_ARCH__
-#if LIMB_BITS_CONFIG == 64
-
-// Multiply-accumulate one limb: (carry_out, t_j) = t_j + a_j * b_i + carry_in
-//
-// All intermediates (_lo, _hi) are PTX .reg temporaries inside a { } scope
-// block to avoid: (1) nvcc register aliasing between C operands, and (2)
-// duplicate .reg definitions when the macro is expanded multiple times.
-// The 6-instruction sequence:
-//   mul.lo.u64  _lo, a_j, b_i      -- low 64 bits of product
-//   mul.hi.u64  _hi, a_j, b_i      -- high 64 bits of product
-//   add.cc.u64  t_j, t_j, _lo      -- t_j += _lo, set CF
-//   addc.u64    _hi, _hi, 0        -- _hi += CF
-//   add.cc.u64  t_j, t_j, carry    -- t_j += carry_in, set CF
-//   addc.u64    carry, _hi, 0      -- carry_out = _hi + CF
-#define LIMB_MACC(t_j, carry, a_j, b_i)                                        \
-  asm volatile("{\n\t"                                                         \
-               ".reg .u64 _lo, _hi;\n\t"                                       \
-               "mul.lo.u64  _lo, %2, %3;\n\t"                                  \
-               "mul.hi.u64  _hi, %2, %3;\n\t"                                  \
-               "add.cc.u64  %0, %0, _lo;\n\t"                                  \
-               "addc.u64    _hi, _hi, 0;\n\t"                                  \
-               "add.cc.u64  %0, %0, %1;\n\t"                                   \
-               "addc.u64    %1, _hi, 0;\n\t"                                   \
-               "}\n\t"                                                         \
-               : "+l"(t_j), "+l"(carry)                                        \
-               : "l"(a_j), "l"(b_i))
-
-// Single CIOS iteration: multiply-accumulate, reduce, and shift.
-//
-// Computes:
-//   1. t += a * b_i  (7 limb multiply-accumulate with carry chain)
-//   2. m = t[0] * p_prime  (Montgomery reduction factor)
-//   3. t += m * p  (reduction, zeros out t[0])
-//   4. Shift t right by one limb (via register renaming into r0..r7)
-//
-// The macro lets the compiler allocate registers across all 7 unrolled
-// iterations, avoiding spills to local memory.
-#define CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, \
-                           a5, a6, b_i, p0, p1, p2, p3, p4, p5, p6, p_prime,   \
-                           r0, r1, r2, r3, r4, r5, r6, r7)                     \
-  do {                                                                         \
-    uint64_t _carry = 0;                                                       \
-    /* Step 1: t += a * b_i */                                                 \
-    LIMB_MACC(t0, _carry, a0, b_i);                                            \
-    LIMB_MACC(t1, _carry, a1, b_i);                                            \
-    LIMB_MACC(t2, _carry, a2, b_i);                                            \
-    LIMB_MACC(t3, _carry, a3, b_i);                                            \
-    LIMB_MACC(t4, _carry, a4, b_i);                                            \
-    LIMB_MACC(t5, _carry, a5, b_i);                                            \
-    LIMB_MACC(t6, _carry, a6, b_i);                                            \
-    /* Accumulate final carry into overflow limb t7 */                         \
-    uint64_t _overflow;                                                        \
-    asm("add.cc.u64  %0, %0, %2;\n\t"                                          \
-        "addc.u64    %1, 0, 0;\n\t"                                            \
-        : "+l"(t7), "=l"(_overflow)                                            \
-        : "l"(_carry));                                                        \
-                                                                               \
-    /* Step 2: m = t0 * p_prime mod 2^64 */                                    \
-    uint64_t _m = t0 * p_prime;                                                \
-                                                                               \
-    /* Step 3: t += m * p (zeros out t0) */                                    \
-    _carry = 0;                                                                \
-    LIMB_MACC(t0, _carry, _m, p0);                                             \
-    LIMB_MACC(t1, _carry, _m, p1);                                             \
-    LIMB_MACC(t2, _carry, _m, p2);                                             \
-    LIMB_MACC(t3, _carry, _m, p3);                                             \
-    LIMB_MACC(t4, _carry, _m, p4);                                             \
-    LIMB_MACC(t5, _carry, _m, p5);                                             \
-    LIMB_MACC(t6, _carry, _m, p6);                                             \
-    /* Finalize overflow: t7 = t7 + _carry + _overflow */                      \
-    /* Plain adds (no carry chain) -- the CIOS invariant guarantees this */    \
-    /* sum fits in 64 bits so intermediate overflow does not matter. */        \
-    t7 += _carry;                                                              \
-    t7 += _overflow;                                                           \
-                                                                               \
-    /* Step 4: Shift right by one limb via register renaming */                \
-    /* t0 is now zero (by construction of m), discard it */                    \
-    r0 = t1;                                                                   \
-    r1 = t2;                                                                   \
-    r2 = t3;                                                                   \
-    r3 = t4;                                                                   \
-    r4 = t5;                                                                   \
-    r5 = t6;                                                                   \
-    r6 = t7;                                                                   \
-    r7 = 0;                                                                    \
-  } while (0)
-
-__device__ __noinline__ void fp_mont_mul_cios_ptx(Fp &c, const Fp &a,
-                                                  const Fp &b) {
-  const uint64_t p0 = DEVICE_MODULUS.limb[0];
-  const uint64_t p1 = DEVICE_MODULUS.limb[1];
-  const uint64_t p2 = DEVICE_MODULUS.limb[2];
-  const uint64_t p3 = DEVICE_MODULUS.limb[3];
-  const uint64_t p4 = DEVICE_MODULUS.limb[4];
-  const uint64_t p5 = DEVICE_MODULUS.limb[5];
-  const uint64_t p6 = DEVICE_MODULUS.limb[6];
-  const uint64_t pp = DEVICE_P_PRIME;
-
-  const uint64_t a0 = a.limb[0], a1 = a.limb[1], a2 = a.limb[2];
-  const uint64_t a3 = a.limb[3], a4 = a.limb[4], a5 = a.limb[5];
-  const uint64_t a6 = a.limb[6];
-
-  // Accumulator: 7 limbs + 1 overflow, initialized to zero
-  uint64_t t0 = 0, t1 = 0, t2 = 0, t3 = 0;
-  uint64_t t4 = 0, t5 = 0, t6 = 0, t7 = 0;
-
-  // 7 fully-unrolled CIOS iterations with register renaming for the shift.
-  // Each iteration processes one limb of b, accumulates a*b[i], reduces,
-  // and shifts. The output registers become the input for the next iteration.
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[0], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[1], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[2], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[3], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[4], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[5], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
-                     b.limb[6], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
-                     t4, t5, t6, t7);
-
-  // Final reduction: if t[0..7] >= p (extended to 8 limbs), subtract p.
-  // Compute (t[0..6] - p[0..6]) with borrow, then subtract borrow from t7.
-  // If t7 after subtraction is non-negative, the reduced result is valid;
-  // otherwise the original t[0..6] is already in [0, p).
-  uint64_t r0, r1, r2, r3, r4, r5, r6, mask;
-  asm("sub.cc.u64   %0, %8,  %15;\n\t" // r0 = t0 - p0
-      "subc.cc.u64  %1, %9,  %16;\n\t" // r1 = t1 - p1 - borrow
-      "subc.cc.u64  %2, %10, %17;\n\t" // r2 = t2 - p2 - borrow
-      "subc.cc.u64  %3, %11, %18;\n\t" // r3 = t3 - p3 - borrow
-      "subc.cc.u64  %4, %12, %19;\n\t" // r4 = t4 - p4 - borrow
-      "subc.cc.u64  %5, %13, %20;\n\t" // r5 = t5 - p5 - borrow
-      "subc.cc.u64  %6, %14, %21;\n\t" // r6 = t6 - p6 - borrow
-      "subc.u64     %7, %22, 0;\n\t"   // mask_src = t7 - 0 - borrow
-      "shr.s64      %7, %7, 63;\n\t" // mask = sign-extend: -1 if negative, 0 if
-                                     // >= 0
-      : "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
-        "=l"(mask)
-      : "l"(t0), "l"(t1), "l"(t2), "l"(t3), "l"(t4), "l"(t5), "l"(t6), "l"(p0),
-        "l"(p1), "l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6), "l"(t7));
-
-  // Branchless selection:
-  //   mask = 0  -> t >= p (use reduced r[0..6])
-  //   mask = -1 -> t < p  (keep original t[0..6])
-  c.limb[0] = (t0 & mask) | (r0 & ~mask);
-  c.limb[1] = (t1 & mask) | (r1 & ~mask);
-  c.limb[2] = (t2 & mask) | (r2 & ~mask);
-  c.limb[3] = (t3 & mask) | (r3 & ~mask);
-  c.limb[4] = (t4 & mask) | (r4 & ~mask);
-  c.limb[5] = (t5 & mask) | (r5 & ~mask);
-  c.limb[6] = (t6 & mask) | (r6 & ~mask);
-}
-
-#undef LIMB_MACC
-#undef CIOS_ITERATION_PTX
-
-#endif // LIMB_BITS_CONFIG == 64
-#endif // __CUDA_ARCH__
-
 // CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication
 // Fuses multiplication and reduction in a single pass for better efficiency.
 // Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
 // Both a and b are in Montgomery form, result is in Montgomery form.
 __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
-#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
-  // Device path: fully unrolled PTX with hardware carry flags
-  fp_mont_mul_cios_ptx(c, a, b);
-#else
-  // Host path: portable C++ implementation
  const Fp &p = fp_modulus();
  UNSIGNED_LIMB p_prime = fp_p_prime();

  // Working array: only n+1 limbs needed (vs 2n for separate mul+reduce)
  UNSIGNED_LIMB t[FP_LIMBS + 1];
+#ifdef __CUDA_ARCH__
+  for (int i = 0; i < FP_LIMBS + 1; i++) {
+    t[i] = 0;
+  }
+#else
  memset(t, 0, (FP_LIMBS + 1) * sizeof(UNSIGNED_LIMB));
+#endif

  // Main CIOS loop: for each limb of b
  for (int i = 0; i < FP_LIMBS; i++) {
@@ -810,7 +529,14 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
  }

  // Copy result to output
+#ifdef __CUDA_ARCH__
+#pragma unroll
+  for (int i = 0; i < FP_LIMBS; i++) {
+    c.limb[i] = t[i];
+  }
+#else
  memcpy(&c.limb[0], t, FP_LIMBS * sizeof(UNSIGNED_LIMB));
+#endif

  // Final reduction: if result >= p or there's overflow, subtract p
  if (t[FP_LIMBS] != 0 || fp_cmp(c, p) != ComparisonType::Less) {
@@ -819,7 +545,6 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
    fp_copy(c, reduced);
  }
  // Result is in Montgomery form
-#endif
 }

 // Montgomery multiplication: c = (a * b * R_INV) mod p
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
@@ -23,8 +23,7 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)

 # Build device library from tfhe-cuda-backend
-add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
-                                     ${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
+add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
 set_target_properties(
  tfhe_device_bench
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_msm.cu
@@ -140,14 +140,14 @@ static void BM_G1_MSM(benchmark::State &state) {
  // Warm-up iterations
  for (int i = 0; i < WARMUP_ITERATIONS; i++) {
    point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch);
+                       d_scalars, n, d_scratch, size_tracker, true);
  }
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

  // Benchmark loop: only measure the MSM computation, no memory operations
  for (auto _ : state) {
    point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch);
+                       d_scalars, n, d_scratch, size_tracker, true);
    benchmark::ClobberMemory();
  }

@@ -221,14 +221,14 @@ static void BM_G2_MSM(benchmark::State &state) {
  // Warm-up iterations
  for (int i = 0; i < WARMUP_ITERATIONS; i++) {
    point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch);
+                       d_scalars, n, d_scratch, size_tracker, true);
  }
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

  // Benchmark loop: only measure the MSM computation, no memory operations
  for (auto _ : state) {
    point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch);
+                       d_scalars, n, d_scratch, size_tracker, true);
    benchmark::ClobberMemory();
  }

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
@@ -20,8 +20,7 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)

 # Build device library from tfhe-cuda-backend
-add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
-                               ${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
+add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
 set_target_properties(
  tfhe_device
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_curve_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_curve_ops.cu
@@ -13,8 +13,9 @@
 //   ./build/tests_and_benchmarks/tests/basic/basic_curve_ops

 #include "curve.h"
-#include "device.h"
 #include "fp.h"
+#undef NDEBUG // keep assert() checks active in release (NDEBUG) builds
+#include <cassert>
 #include <cstdio>
 #include <cstring>

@@ -24,7 +24,7 @@ int main() {
  // (non-Montgomery) form. Convert to Montgomery, then lift to projective for
  // host-side arithmetic.
  const G1Affine &gen_normal = g1_generator();
-  PANIC_IF_FALSE(!g1_is_infinity(gen_normal), "generator must not be infinity");
+  assert(!g1_is_infinity(gen_normal));

  G1Affine gen_affine = gen_normal;
  point_to_montgomery_inplace(gen_affine);
@@ -37,21 +37,21 @@ int main() {

  // G + (-G) = identity (Z = 0 in the projective convention)
  G1Projective identity = G + neg_G;
-  PANIC_IF_FALSE(fp_is_zero(identity.Z), "G + (-G) must be identity (Z = 0)");
+  assert(fp_is_zero(identity.Z));
  printf("Negation (-G) and G + (-G) = identity: OK\n");

  // ---- Addition: 2*G = G + G, 3*G = 2*G + G ----
  G1Projective two_G = G + G;
-  PANIC_IF_FALSE(!(two_G == G1Projective{}), "2*G must not be identity");
+  assert(!(two_G == G1Projective())); // not the identity

  G1Projective three_G = two_G + G;
-  PANIC_IF_FALSE(!(three_G == G1Projective{}), "3*G must not be identity");
+  assert(!(three_G == G1Projective()));
  printf("Addition (2*G, 3*G): OK\n");

  // ---- Compound assignment: G += G ----
  G1Projective acc = G;
  acc += G; // acc = 2*G
-  PANIC_IF_FALSE(acc == two_G, "G += G must equal 2*G");
+  assert(acc == two_G);
  printf("Compound assignment (+=): OK\n");

  // ---- Scalar multiplication: 3*G using Scalar type ----
@@ -61,22 +61,19 @@ int main() {
  scalar_3.limb[0] = 3;

  G1Projective three_G_via_scalar = G * scalar_3;
-  PANIC_IF_FALSE(!(three_G_via_scalar == G1Projective{}),
-                 "3*G via scalar must not be identity");
+  assert(!(three_G_via_scalar == G1Projective()));

  // Normalise both to Z = 1 (Montgomery) before comparing coordinates.
  normalize_projective_g1(three_G);
  normalize_projective_g1(three_G_via_scalar);
-  PANIC_IF_FALSE(three_G == three_G_via_scalar,
-                 "3*G via addition must equal 3*G via scalar multiply");
+  assert(three_G == three_G_via_scalar);
  printf("Scalar multiplication (3*G == G + G + G): OK\n");

  // ---- Projective -> affine conversion ----
  // projective_to_affine_g1 keeps coordinates in Montgomery form.
  G1Affine three_G_affine;
  projective_to_affine_g1(three_G_affine, three_G);
-  PANIC_IF_FALSE(!g1_is_infinity(three_G_affine),
-                 "3*G in affine must not be infinity");
+  assert(!g1_is_infinity(three_G_affine));
  printf("Projective -> affine conversion: OK\n");

  // ---- Convert to normal-form coordinates ----
@@ -85,8 +82,7 @@ int main() {
  G1Projective result = three_G_via_scalar;
  normalize_from_montgomery_g1(
      result); // coordinates now in normal (non-Montgomery) form
-  PANIC_IF_FALSE(!fp_is_zero(result.Z),
-                 "normalized result must have non-zero Z");
+  assert(!fp_is_zero(result.Z)); // Z = 1 (non-zero)
  printf("Conversion to normal-form projective: OK\n");

  printf("All G1 curve operations passed.\n");
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_fp_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_fp_ops.cu
@@ -11,8 +11,9 @@
 //   cmake --build build --target basic_fp_ops
 //   ./build/tests_and_benchmarks/tests/basic/basic_fp_ops

-#include "device.h"
 #include "fp.h"
+#undef NDEBUG // keep assert() checks active in release (NDEBUG) builds
+#include <cassert>
 #include <cstdio>

 int main() {
@@ -25,16 +25,16 @@ int main() {
  fp_one(b); // b = 1

  c = a + b; // c = 2
-  PANIC_IF_FALSE(c.limb[0] == 2, "1 + 1 must equal 2");
+  assert(c.limb[0] == 2);

  c = c - a; // c = 1
-  PANIC_IF_FALSE(fp_is_one(c), "2 - 1 must equal 1");
+  assert(fp_is_one(c));

  // Compound assignment
  c += a; // c = 2
-  PANIC_IF_FALSE(c.limb[0] == 2, "1 += 1 must equal 2");
+  assert(c.limb[0] == 2);
  c -= b; // c = 1
-  PANIC_IF_FALSE(fp_is_one(c), "2 -= 1 must equal 1");
+  assert(fp_is_one(c));

  printf("Addition/subtraction: OK\n");

@@ -43,7 +43,7 @@ int main() {
  // form, but for add/sub/neg small normal-form values also work correctly.
  Fp neg_a = -a; // neg_a = -1 mod p
  Fp sum = a + neg_a;
-  PANIC_IF_FALSE(fp_is_zero(sum), "1 + (-1) must equal 0");
+  assert(fp_is_zero(sum)); // 1 + (-1) = 0
  printf("Negation: OK\n");

  // ---- Multiplication (Montgomery form required) ----
@@ -56,17 +56,17 @@ int main() {

  result_m = one_m * two_m; // result_m = 2 in Montgomery form
  fp_from_montgomery(result, result_m);
-  PANIC_IF_FALSE(result.limb[0] == 2, "1 * 2 must equal 2");
+  assert(result.limb[0] == 2);

  result_m = two_m * two_m; // result_m = 4 in Montgomery form
  fp_from_montgomery(result, result_m);
-  PANIC_IF_FALSE(result.limb[0] == 4, "2 * 2 must equal 4");
+  assert(result.limb[0] == 4);

  // Compound multiplication
  result_m = two_m;
  result_m *= two_m; // result_m = 4
  fp_from_montgomery(result, result_m);
-  PANIC_IF_FALSE(result.limb[0] == 4, "2 *= 2 must equal 4");
+  assert(result.limb[0] == 4);

  // Convert an arbitrary normal-form value to Montgomery before multiplying
  Fp five_normal, five_m, twenty_five_m, twenty_five;
@@ -76,7 +76,7 @@ int main() {

  fp_mont_mul(twenty_five_m, five_m, five_m); // 5 * 5 = 25
  fp_from_montgomery(twenty_five, twenty_five_m);
-  PANIC_IF_FALSE(twenty_five.limb[0] == 25, "5 * 5 must equal 25");
+  assert(twenty_five.limb[0] == 25);

  printf("Multiplication: OK\n");

@@ -88,7 +88,7 @@ int main() {

  Fp one_check;
  fp_div(one_check, five_normal, five_normal); // 5 / 5 = 1
-  PANIC_IF_FALSE(fp_is_one(one_check), "5 / 5 must equal 1");
+  assert(fp_is_one(one_check));

  // Verify: 5 * 5^{-1} == 1  (using fp_div as a cross-check)
  Fp product;
@@ -98,7 +98,7 @@ int main() {
  fp_zero(two_normal);
  two_normal.limb[0] = 2;
  fp_div(product, two_normal, two_normal); // 2 / 2 = 1
-  PANIC_IF_FALSE(fp_is_one(product), "2 / 2 must equal 1");
+  assert(fp_is_one(product));

  printf("Inversion/division: OK\n");

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_msm.cu
@@ -20,6 +20,8 @@
 #include "device.h"
 #include "fp.h"
 #include "msm.h"
+#undef NDEBUG // keep assert() checks active in release (NDEBUG) builds
+#include <cassert>
 #include <cstdio>
 #include <cstring>
 #include <vector>
@@ -32,6 +33,7 @@ int main() {

  const uint32_t gpu_index = 0;
  const uint32_t n = 4; // number of points / scalars
+  uint64_t size_tracker = 0;

  // ---- Prepare host-side points in Montgomery form ----
  // Use n doublings of the G1 generator: G, 2*G, 4*G, 8*G.
@@ -74,7 +76,8 @@ int main() {

  // ---- Run MSM (synchronous wrapper; result written directly to host) ----
  G1Projective h_result;
-  point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch);
+  point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch,
+               size_tracker, true);

  // ---- Verify against naive sequential computation on the host ----
  // Expected = sum over i of (scalar[i] * point[i]).
@@ -92,8 +95,7 @@ int main() {
  // Normalise to Z = 1 (Montgomery) before comparing projective coordinates.
  normalize_projective_g1(h_result);
  normalize_projective_g1(expected);
-  PANIC_IF_FALSE(h_result == expected,
-                 "MSM result must match naive sequential computation");
+  assert(h_result == expected);
  printf("MSM result matches naive sequential computation.\n");

  // ---- Cleanup ----
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp.cu
@@ -3,7 +3,6 @@
 #include "fp.h"
 #include "fp_helpers.h" // Include test-only batch operations and kernels
 #include <chrono>
-#include <cinttypes>
 #include <cstdint>
 #include <cstring>
 #include <cuda_runtime.h>
@@ -298,7 +297,7 @@ protected:

 // Test basic addition (on GPU)
 TEST_F(FpArithmeticTest, Addition) {
-
+  uint64_t size_tracker = 0;
  Fp a, b, c, c_cpu;

  // Test: 1 + 1 = 2
@@ -321,7 +320,7 @@ TEST_F(FpArithmeticTest, Addition) {

 // Test subtraction (on GPU)
 TEST_F(FpArithmeticTest, Subtraction) {
-
+  uint64_t size_tracker = 0;
  Fp a, b, c, a_cpu;

  // Test: 2 - 1 = 1
@@ -342,7 +341,7 @@ TEST_F(FpArithmeticTest, Subtraction) {

 // Test multiplication (on GPU)
 TEST_F(FpArithmeticTest, Multiplication) {
-
+  uint64_t size_tracker = 0;
  Fp five, three, result, expected;

  fp_zero(five);
@@ -371,7 +370,7 @@ TEST_F(FpArithmeticTest, Multiplication) {

 // Test negation (on GPU)
 TEST_F(FpArithmeticTest, Negation) {
-
+  uint64_t size_tracker = 0;
  Fp a, neg_a, result;

  fp_zero(a);
@@ -396,7 +395,7 @@ TEST_F(FpArithmeticTest, Negation) {

 // Test Montgomery conversion round-trip (on GPU)
 TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {
-
+  uint64_t size_tracker = 0;
  Fp value, mont_form, back, mont_form_cpu, back_cpu;

  fp_zero(value);
@@ -422,7 +421,7 @@ TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {

 // Test Montgomery multiplication (on GPU)
 TEST_F(FpArithmeticTest, MontgomeryMultiplication) {
-
+  uint64_t size_tracker = 0;
  Fp five, three, five_m, three_m, result_m, result, expected, result_cpu;

  fp_zero(five);
@@ -461,7 +460,7 @@ TEST_F(FpArithmeticTest, MontgomeryMultiplication) {

 // Test comparison operations (on GPU)
 TEST_F(FpArithmeticTest, Comparison) {
-
+  uint64_t size_tracker = 0;
  Fp five, three;

  fp_zero(five);
@@ -482,7 +481,7 @@ TEST_F(FpArithmeticTest, Comparison) {

 // Test zero and one (on GPU)
 TEST_F(FpArithmeticTest, ZeroAndOne) {
-
+  uint64_t size_tracker = 0;
  Fp zero, one;

  fp_zero(zero);
@@ -500,7 +499,7 @@ TEST_F(FpArithmeticTest, ZeroAndOne) {

 // Test copy (on GPU)
 TEST_F(FpArithmeticTest, Copy) {
-
+  uint64_t size_tracker = 0;
  Fp a, b, b_cpu;

  fp_zero(a);
@@ -523,7 +522,7 @@ TEST_F(FpArithmeticTest, Copy) {

 // Test conditional move (on GPU)
 TEST_F(FpArithmeticTest, ConditionalMove) {
-
+  uint64_t size_tracker = 0;
  Fp a, b, result, result_cpu;

  fp_zero(a);
@@ -564,7 +563,7 @@ TEST_F(FpArithmeticTest, ConditionalMove) {

 // Test multiplication by zero (on GPU)
 TEST_F(FpArithmeticTest, MultiplicationByZero) {
-
+  uint64_t size_tracker = 0;
  Fp a, zero, result, result_cpu;

  fp_zero(zero);
@@ -592,7 +591,7 @@ TEST_F(FpArithmeticTest, MultiplicationByZero) {

 // Test inversion (on GPU)
 TEST_F(FpArithmeticTest, Inversion) {
-
+  uint64_t size_tracker = 0;
  Fp a, a_inv, result, a_inv_cpu;

  fp_zero(a);
@@ -624,7 +623,7 @@ TEST_F(FpArithmeticTest, Inversion) {

 // Test inversion of one (on GPU)
 TEST_F(FpArithmeticTest, InversionOfOne) {
-
+  uint64_t size_tracker = 0;
  Fp one, one_inv, one_inv_cpu;

  fp_one(one);
@@ -646,7 +645,7 @@ TEST_F(FpArithmeticTest, InversionOfOne) {

 // Test division (on GPU)
 TEST_F(FpArithmeticTest, Division) {
-
+  uint64_t size_tracker = 0;
  Fp a, b, quotient, result;

  fp_zero(a);
@@ -679,7 +678,7 @@ TEST_F(FpArithmeticTest, Division) {

 // Test division by one (on GPU)
 TEST_F(FpArithmeticTest, DivisionByOne) {
-
+  uint64_t size_tracker = 0;
  Fp a, one, result;

  fp_one(one);
@@ -708,7 +707,7 @@ TEST_F(FpArithmeticTest, DivisionByOne) {

 // Test exponentiation with small exponent (on GPU)
 TEST_F(FpArithmeticTest, ExponentiationSmall) {
-
+  uint64_t size_tracker = 0;
  Fp base, result, expected, result_cpu;

  fp_zero(base);
@@ -735,7 +734,7 @@ TEST_F(FpArithmeticTest, ExponentiationSmall) {

 // Test exponentiation to power of one (on GPU)
 TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {
-
+  uint64_t size_tracker = 0;
  Fp base, result, result_cpu;

  fp_zero(base);
@@ -759,7 +758,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {

 // Test exponentiation to power of zero (on GPU)
 TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {
-
+  uint64_t size_tracker = 0;
  Fp base, result, one, result_cpu;

  fp_zero(base);
@@ -783,7 +782,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {

 // Test exponentiation with large exponent (Fermat's little theorem)
 TEST_F(FpArithmeticTest, ExponentiationFermat) {
-
+  uint64_t size_tracker = 0;
  Fp a, result;

  fp_zero(a);
@@ -799,7 +798,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermat) {

 // Test exponentiation: a^(p-1) = 1 mod p
 TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {
-
+  uint64_t size_tracker = 0;
  Fp a, result, one;

  fp_zero(a);
@@ -821,7 +820,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {

 // Test square root (on GPU)
 TEST_F(FpArithmeticTest, SquareRoot) {
-
+  uint64_t size_tracker = 0;
  Fp a, square, sqrt_result, verify, square_cpu, sqrt_result_cpu, verify_cpu;

  // Test: sqrt(a^2) = a or -a
@@ -873,12 +872,6 @@ TEST_F(FpArithmeticTest, SquareRoot) {
    // Also test on CPU for comparison
    Fp neg_a_cpu = -a;

-    // Verify GPU negation matches CPU negation
-    EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &neg_a, &neg_a_cpu),
-              ComparisonType::Equal)
-        << "GPU negation should match CPU negation";
-    cuda_synchronize_stream(stream, gpu_index);
-
    bool matches_a = (fp_cmp_gpu(stream, gpu_index, &sqrt_result, &a) ==
                      ComparisonType::Equal);
    cuda_synchronize_stream(stream, gpu_index);
@@ -898,7 +891,7 @@ TEST_F(FpArithmeticTest, SquareRoot) {

 // Test square root of zero (on GPU)
 TEST_F(FpArithmeticTest, SquareRootOfZero) {
-
+  uint64_t size_tracker = 0;
  Fp zero, result, result_cpu;
  fp_zero(zero);

@@ -920,7 +913,7 @@ TEST_F(FpArithmeticTest, SquareRootOfZero) {

 // Test square root of one (on GPU)
 TEST_F(FpArithmeticTest, SquareRootOfOne) {
-
+  uint64_t size_tracker = 0;
  Fp one, result, result_cpu;
  fp_one(one);

@@ -942,7 +935,7 @@ TEST_F(FpArithmeticTest, SquareRootOfOne) {

 // Test quadratic residue check (on GPU)
 TEST_F(FpArithmeticTest, IsQuadraticResidue) {
-
+  uint64_t size_tracker = 0;
  Fp a, square, square_cpu, zero;

  fp_zero(a);
@@ -978,7 +971,7 @@ TEST_F(FpArithmeticTest, IsQuadraticResidue) {
 // device. For now, we test individual conversions on GPU and verify with GPU
 // comparisons
 TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {
-
+  uint64_t size_tracker = 0;
  const int n = 10;
  Fp normal[n], montgomery[n], back[n];

@@ -1018,7 +1011,7 @@ TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {

 // Test 1: Addition that doesn't overflow (on GPU)
 TEST_F(FpArithmeticTest, LargeAddition1) {
-
+  uint64_t size_tracker = 0;
  // a = large value
  Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
                             0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
@@ -1051,7 +1044,7 @@ TEST_F(FpArithmeticTest, LargeAddition1) {

 // Test 2: Addition that triggers reduction (sum > p) (on GPU)
 TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {
-
+  uint64_t size_tracker = 0;
  // Use two large numbers that will trigger reduction
  // a + b should wrap around modulus
  Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
@@ -1083,7 +1076,7 @@ TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {

 // Test 3: Subtraction without borrow (on GPU)
 TEST_F(FpArithmeticTest, LargeSubtraction1) {
-
+  uint64_t size_tracker = 0;
  // a = large value
  Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
                             0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
@@ -1113,15 +1106,11 @@ TEST_F(FpArithmeticTest, LargeSubtraction1) {
            ComparisonType::Equal)
      << "GPU result should match CPU result";
  cuda_synchronize_stream(stream, gpu_index);
-  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &verify, &verify_cpu),
-            ComparisonType::Equal)
-      << "GPU subtraction roundtrip should match CPU roundtrip";
-  cuda_synchronize_stream(stream, gpu_index);
 }

 // Test 4: Subtraction with borrow (a < b) (on GPU)
 TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {
-
+  uint64_t size_tracker = 0;
  // a = 50
  Fp a = test_utils::make_fp(0x32ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL,
                             0x0ULL);
@@ -1155,7 +1144,7 @@ TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {

 // Test 5: Multiplication of large values (triggers reduction) (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication1) {
-
+  uint64_t size_tracker = 0;
  // a = 2^200 (bit 200 set)
  Fp a;
  fp_zero(a);
@@ -1201,7 +1190,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication1) {

 // Test 6: (p-1) * (p-1) = 1 (mod p) (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
-
+  uint64_t size_tracker = 0;
  // a = p - 1
  Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
                             0x824e6dc3e23acdeeULL, 0xf75a64bbac71602ULL,
@@ -1239,7 +1228,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {

 // Test 7: Multiplication with 2: a * 2 = a + a (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
-
+  uint64_t size_tracker = 0;
  // a = large value
  Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAE6ULL,
                             0x412736E1F11D66F7ULL, 0x7BAD325DD638B01ULL,
@@ -1275,15 +1264,11 @@ TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
            ComparisonType::Equal)
      << "GPU result should match CPU result";
  cuda_synchronize_stream(stream, gpu_index);
-  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &expected, &expected_cpu),
-            ComparisonType::Equal)
-      << "GPU addition should match CPU addition";
-  cuda_synchronize_stream(stream, gpu_index);
 }

 // Test 8: Large number squared (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
-
+  uint64_t size_tracker = 0;
  // a = large value
  Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
                             0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL);
@@ -1320,7 +1305,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication4Square) {

 // Test 9: Addition chain near modulus (on GPU)
 TEST_F(FpArithmeticTest, LargeAddition3Chain) {
-
+  uint64_t size_tracker = 0;
  // Start with p-1
  Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
                             0x824e6dc3e23acdeeULL, 0x0f75a64bbac71602ULL,
@@ -1347,7 +1332,7 @@ TEST_F(FpArithmeticTest, LargeAddition3Chain) {

 // Test 10: Complex multiplication with reduction (on GPU)
 TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
-
+  uint64_t size_tracker = 0;
  // a = large prime-like number
  Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
                             0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
@@ -1401,7 +1386,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {

 // Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
 TEST_F(FpPropertyTest, AdditionAssociativity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1427,7 +1412,7 @@ TEST_F(FpPropertyTest, AdditionAssociativity) {

 // Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
 TEST_F(FpPropertyTest, MultiplicationAssociativity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) { // Fewer iterations due to multiplication cost
    Fp a = random_value();
    Fp b = random_value();
@@ -1453,7 +1438,7 @@ TEST_F(FpPropertyTest, MultiplicationAssociativity) {

 // Test distributivity: a * (b + c) = a*b + a*c (on GPU)
 TEST_F(FpPropertyTest, MultiplicationDistributivity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1480,7 +1465,7 @@ TEST_F(FpPropertyTest, MultiplicationDistributivity) {

 // Test addition commutativity with random values (on GPU)
 TEST_F(FpPropertyTest, AdditionCommutativityRandom) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1498,7 +1483,7 @@ TEST_F(FpPropertyTest, AdditionCommutativityRandom) {

 // Test multiplication commutativity with random values (on GPU)
 TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1516,7 +1501,7 @@ TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {

 // Test additive identity: a + 0 = a (on GPU)
 TEST_F(FpPropertyTest, AdditiveIdentity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp result;
@@ -1532,7 +1517,7 @@ TEST_F(FpPropertyTest, AdditiveIdentity) {

 // Test multiplicative identity: a * 1 = a (on GPU)
 TEST_F(FpPropertyTest, MultiplicativeIdentity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp result;
@@ -1548,7 +1533,7 @@ TEST_F(FpPropertyTest, MultiplicativeIdentity) {

 // Test additive inverse: a + (-a) = 0 (on GPU)
 TEST_F(FpPropertyTest, AdditiveInverse) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp neg_a, result;
@@ -1565,7 +1550,7 @@ TEST_F(FpPropertyTest, AdditiveInverse) {

 // Test double negation: -(-a) = a (on GPU)
 TEST_F(FpPropertyTest, DoubleNegation) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp neg_a, neg_neg_a;
@@ -1583,7 +1568,7 @@ TEST_F(FpPropertyTest, DoubleNegation) {

 // Test subtraction as addition of negation: a - b = a + (-b) (on GPU)
 TEST_F(FpPropertyTest, SubtractionAsNegation) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1605,7 +1590,7 @@ TEST_F(FpPropertyTest, SubtractionAsNegation) {

 // Test Montgomery form round-trip with random values (on GPU)
 TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp mont_form, back;
@@ -1622,7 +1607,7 @@ TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {

 // Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
 TEST_F(FpPropertyTest, MultiplicativeInverse) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1645,7 +1630,7 @@ TEST_F(FpPropertyTest, MultiplicativeInverse) {

 // Test division: (a / b) * b = a (on GPU)
 TEST_F(FpPropertyTest, DivisionProperty) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1669,7 +1654,7 @@ TEST_F(FpPropertyTest, DivisionProperty) {

 // Test division as multiplication by inverse: a / b = a * b^(-1) (on GPU)
 TEST_F(FpPropertyTest, DivisionAsInverse) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp b = random_value();
@@ -1697,7 +1682,7 @@ TEST_F(FpPropertyTest, DivisionAsInverse) {

 // Test exponentiation: (a^e1)^e2 = a^(e1*e2) for small exponents (on GPU)
 TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1731,7 +1716,7 @@ TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {

 // Test exponentiation: a^e1 * a^e2 = a^(e1+e2) (on GPU)
 TEST_F(FpPropertyTest, ExponentiationProduct) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1766,7 +1751,7 @@ TEST_F(FpPropertyTest, ExponentiationProduct) {

 // Test inversion of inversion: (a^(-1))^(-1) = a (on GPU)
 TEST_F(FpPropertyTest, DoubleInversion) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    // Skip zero (on GPU)
@@ -1790,7 +1775,7 @@ TEST_F(FpPropertyTest, DoubleInversion) {

 // Test square root property: sqrt(a^2) = a (for random a) (on GPU)
 TEST_F(FpPropertyTest, SquareRootProperty) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp a = random_value();
    Fp square, sqrt_result, verify;
@@ -1834,7 +1819,7 @@ TEST_F(FpPropertyTest, SquareRootProperty) {
 // Test quadratic residue property: squares are always quadratic residues (on
 // GPU)
 TEST_F(FpPropertyTest, QuadraticResidueProperty) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp a = random_value();
    Fp square;
@@ -1856,7 +1841,7 @@ TEST_F(FpPropertyTest, QuadraticResidueProperty) {

 // Test operations with p-1 (on GPU)
 TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {
-
+  uint64_t size_tracker = 0;
  // (p-1) + 1 = 0 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &modulus_minus_one, &one);
@@ -1883,7 +1868,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {

 // Test operations with p-2 (on GPU)
 TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {
-
+  uint64_t size_tracker = 0;
  // (p-2) + 1 = p-1 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &modulus_minus_two, &one);
@@ -1903,7 +1888,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {

 // Test operations with very small values (on GPU)
 TEST_F(FpEdgeCaseTest, VerySmallValues) {
-
+  uint64_t size_tracker = 0;
  Fp zero_val, one_val, two_val, three_val;
  fp_zero(zero_val);
  fp_one(one_val);
@@ -1945,7 +1930,7 @@ TEST_F(FpEdgeCaseTest, VerySmallValues) {

 // Test operations with max limb values (on GPU)
 TEST_F(FpEdgeCaseTest, MaxLimbValues) {
-
+  uint64_t size_tracker = 0;
  // Test that max_limb_value is valid
  EXPECT_TRUE(test_utils::is_valid_fp(max_limb_value))
      << "max_limb_value should be < p";
@@ -1968,7 +1953,7 @@ TEST_F(FpEdgeCaseTest, MaxLimbValues) {

 // Test operations with alternating bit patterns (on GPU)
 TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {
-
+  uint64_t size_tracker = 0;
  // Test that alternating_bits is valid
  EXPECT_TRUE(test_utils::is_valid_fp(alternating_bits))
      << "alternating_bits should be < p";
@@ -1991,7 +1976,7 @@ TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {

 // Test edge case: zero operations (on GPU)
 TEST_F(FpEdgeCaseTest, ZeroOperations) {
-
+  uint64_t size_tracker = 0;
  // 0 + 0 = 0 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &zero, &zero);
@@ -2021,7 +2006,7 @@ TEST_F(FpEdgeCaseTest, ZeroOperations) {

 // Test edge case: one operations (on GPU)
 TEST_F(FpEdgeCaseTest, OneOperations) {
-
+  uint64_t size_tracker = 0;
  // 1 + 1 = 2 (on GPU)
  Fp result;
  fp_add_gpu(stream, gpu_index, &result, &one, &one);
@@ -2048,7 +2033,7 @@ TEST_F(FpEdgeCaseTest, OneOperations) {

 // Test fp_one_montgomery (on GPU)
 TEST_F(FpEdgeCaseTest, OneMontgomery) {
-
+  uint64_t size_tracker = 0;
  Fp one_mont, one_normal;
  fp_one(one_normal);
  fp_one_montgomery(one_mont);
@@ -2064,7 +2049,7 @@ TEST_F(FpEdgeCaseTest, OneMontgomery) {

 // Test repeated operations (stress test) (on GPU)
 TEST_F(FpEdgeCaseTest, RepeatedOperations) {
-
+  uint64_t size_tracker = 0;
  Fp a = test_utils::random_fp(rng);
  Fp result = a;

@@ -2106,7 +2091,7 @@ TEST_F(FpEdgeCaseTest, RepeatedOperations) {

 // Test CUDA kernel: array addition
 TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {
-
+  uint64_t size_tracker = 0;
  const int n = 1000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2141,7 +2126,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {

 // Test CUDA kernel: array multiplication
 TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
-
+  uint64_t size_tracker = 0;
  const int n = 1000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2181,7 +2166,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {

 // Test CUDA kernel: array addition with edge cases
 TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {
-
+  uint64_t size_tracker = 0;
  const int n = 100;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2231,7 +2216,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {

 // Test CUDA kernel: array multiplication with edge cases
 TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
-
+  uint64_t size_tracker = 0;
  const int n = 100;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2286,7 +2271,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {

 // Test CUDA kernel: large array
 TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
-
+  uint64_t size_tracker = 0;
  const int n = 10000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2327,7 +2312,7 @@ TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
 // Test CUDA kernel: boundary conditions for launch configuration
 // Tests that the "if (idx < n)" check works correctly at block boundaries
 TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {
-
+  uint64_t size_tracker = 0;
  // Test sizes that stress the launch configuration
  // threadsPerBlock = 256, so test around block boundaries
  std::vector<int> test_sizes = {1,   255, 256,  257,  511,
@@ -2367,7 +2352,7 @@ TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {

 // Test CUDA kernel: verify kernel actually launches (not just CPU fallback)
 TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {
-
+  uint64_t size_tracker = 0;
  const int n = 1000;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2399,7 +2384,7 @@ TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {

 // Test CUDA kernel: verify device constant memory is accessible
 TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
-
+  uint64_t size_tracker = 0;
  // This test verifies that DEVICE_MODULUS is properly initialized
  // by running a kernel that uses it (multiplication uses Montgomery which
  // needs modulus)
@@ -2443,7 +2428,7 @@ TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {

 // Test CUDA kernel: empty array (edge case)
 TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {
-
+  uint64_t size_tracker = 0;
  const int n = 0;
  Fp *h_a = nullptr;
  Fp *h_b = nullptr;
@@ -2460,7 +2445,7 @@ TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {

 // Test CUDA kernel: single element
 TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
-
+  uint64_t size_tracker = 0;
  const int n = 1;
  Fp *h_a = new Fp[n];
  Fp *h_b = new Fp[n];
@@ -2486,53 +2471,44 @@ TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
 // ============================================================================

 // Test to print generator values (for hardcoding)
-// PRIx64 format specifiers require 64-bit limbs
-#if LIMB_BITS_CONFIG == 64
 TEST_F(FpArithmeticTest, PrintGenerators) {
-
+  uint64_t size_tracker = 0;
  const G1Affine &g1 = g1_generator();
  const G2Affine &g2 = g2_generator();

  printf("\n=== G1 Generator (Montgomery form) ===\n");
-  printf("x: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL}\n",
+  printf("x: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
+         "0x%llxULL, 0x%llxULL}\n",
         g1.x.limb[0], g1.x.limb[1], g1.x.limb[2], g1.x.limb[3], g1.x.limb[4],
         g1.x.limb[5], g1.x.limb[6]);
-  printf("y: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL}\n",
+  printf("y: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
+         "0x%llxULL, 0x%llxULL}\n",
         g1.y.limb[0], g1.y.limb[1], g1.y.limb[2], g1.y.limb[3], g1.y.limb[4],
         g1.y.limb[5], g1.y.limb[6]);

  printf("\n=== G2 Generator (Montgomery form) ===\n");
-  printf("x.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL}\n",
+  printf("x.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
+         "0x%llxULL, 0x%llxULL}\n",
         g2.x.c0.limb[0], g2.x.c0.limb[1], g2.x.c0.limb[2], g2.x.c0.limb[3],
         g2.x.c0.limb[4], g2.x.c0.limb[5], g2.x.c0.limb[6]);
-  printf("x.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL}\n",
+  printf("x.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
+         "0x%llxULL, 0x%llxULL}\n",
         g2.x.c1.limb[0], g2.x.c1.limb[1], g2.x.c1.limb[2], g2.x.c1.limb[3],
         g2.x.c1.limb[4], g2.x.c1.limb[5], g2.x.c1.limb[6]);
-  printf("y.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL}\n",
+  printf("y.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
+         "0x%llxULL, 0x%llxULL}\n",
         g2.y.c0.limb[0], g2.y.c0.limb[1], g2.y.c0.limb[2], g2.y.c0.limb[3],
         g2.y.c0.limb[4], g2.y.c0.limb[5], g2.y.c0.limb[6]);
-  printf("y.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
-         "0x%" PRIx64 "ULL}\n",
+  printf("y.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
+         "0x%llxULL, 0x%llxULL}\n",
         g2.y.c1.limb[0], g2.y.c1.limb[1], g2.y.c1.limb[2], g2.y.c1.limb[3],
         g2.y.c1.limb[4], g2.y.c1.limb[5], g2.y.c1.limb[6]);
  printf("\n");
 }
-#endif

 // Test is_on_curve_g1 with point at infinity
 TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
-
+  uint64_t size_tracker = 0;
  G1Affine point;
  g1_point_at_infinity(point);

@@ -2544,7 +2520,7 @@ TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
 // We'll create a point by starting with a valid y and computing x
 // Or use a known valid point
 TEST_F(FpArithmeticTest, CurveG1ValidPoint) {
-
+  uint64_t size_tracker = 0;
  G1Affine point;
  point.infinity = false;

@@ -2617,7 +2593,7 @@ TEST_F(FpArithmeticTest, CurveG1ValidPoint) {

 // Test is_on_curve_g1 with invalid point
 TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {
-
+  uint64_t size_tracker = 0;
  G1Affine point;
  point.infinity = false;

@@ -2630,7 +2606,7 @@ TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {

 // Test that negating y preserves curve validity (on GPU)
 TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {
-
+  uint64_t size_tracker = 0;
  G1Affine point;
  point.infinity = false;

@@ -2674,7 +2650,7 @@ TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {

 // Test is_on_curve_g2 with point at infinity
 TEST_F(FpArithmeticTest, CurveG2PointAtInfinity) {
-
+  uint64_t size_tracker = 0;
  G2Affine point;
  g2_point_at_infinity(point);

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp2.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp2.cu
@@ -152,7 +152,7 @@ protected:

 // Test basic addition (on GPU)
 TEST_F(Fp2ArithmeticTest, Addition) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, b, c, c_cpu;

  // Test: (1 + 0*i) + (1 + 0*i) = (2 + 0*i)
@@ -175,7 +175,7 @@ TEST_F(Fp2ArithmeticTest, Addition) {

 // Test subtraction (on GPU)
 TEST_F(Fp2ArithmeticTest, Subtraction) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, b, c, a_cpu;

  // Test: (2 + 0*i) - (1 + 0*i) = (1 + 0*i)
@@ -197,7 +197,7 @@ TEST_F(Fp2ArithmeticTest, Subtraction) {

 // Test multiplication (on GPU)
 TEST_F(Fp2ArithmeticTest, Multiplication) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, b, result, expected, result_cpu;

  // Test: (1 + 1*i) * (1 + 1*i) = (0 + 2*i)
@@ -224,7 +224,7 @@ TEST_F(Fp2ArithmeticTest, Multiplication) {

 // Test i * i = -1 (on GPU)
 TEST_F(Fp2ArithmeticTest, I_Squared) {
-
+  uint64_t size_tracker = 0;
  Fp2 i_val, result, expected, result_cpu;

  // i = 0 + 1*i
@@ -250,7 +250,7 @@ TEST_F(Fp2ArithmeticTest, I_Squared) {

 // Test negation (on GPU)
 TEST_F(Fp2ArithmeticTest, Negation) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, neg_a, result, neg_a_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -273,7 +273,7 @@ TEST_F(Fp2ArithmeticTest, Negation) {

 // Test conjugation (on GPU)
 TEST_F(Fp2ArithmeticTest, Conjugation) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, conj, result, conj_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -304,7 +304,7 @@ TEST_F(Fp2ArithmeticTest, Conjugation) {

 // Test squaring (on GPU)
 TEST_F(Fp2ArithmeticTest, Squaring) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, square, square_cpu;

  // Test: (1 + 1*i)^2 = 2*i
@@ -327,7 +327,7 @@ TEST_F(Fp2ArithmeticTest, Squaring) {

 // Test zero and one (on GPU)
 TEST_F(Fp2ArithmeticTest, ZeroAndOne) {
-
+  uint64_t size_tracker = 0;
  Fp2 zero_val, one_val;

  fp2_zero(zero_val);
@@ -349,7 +349,7 @@ TEST_F(Fp2ArithmeticTest, ZeroAndOne) {

 // Test copy (on GPU)
 TEST_F(Fp2ArithmeticTest, Copy) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, b, b_cpu;

  a = test_utils_fp2::make_fp2_simple(42, 123);
@@ -370,7 +370,7 @@ TEST_F(Fp2ArithmeticTest, Copy) {

 // Test conditional move (on GPU)
 TEST_F(Fp2ArithmeticTest, ConditionalMove) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, b, result, result_cpu;

  a = test_utils_fp2::make_fp2_simple(10, 20);
@@ -411,7 +411,7 @@ TEST_F(Fp2ArithmeticTest, ConditionalMove) {

 // Test multiplication by zero (on GPU)
 TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, zero_val, result, result_cpu;

  fp2_zero(zero_val);
@@ -432,7 +432,7 @@ TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {

 // Test inversion (on GPU)
 TEST_F(Fp2ArithmeticTest, Inversion) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, a_inv, result, a_inv_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -456,7 +456,7 @@ TEST_F(Fp2ArithmeticTest, Inversion) {

 // Test division (on GPU)
 TEST_F(Fp2ArithmeticTest, Division) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, b, quotient, result, quotient_cpu, result_cpu;

  a = test_utils_fp2::make_fp2_simple(10, 6);
@@ -482,7 +482,7 @@ TEST_F(Fp2ArithmeticTest, Division) {

 // Test multiply by i (on GPU)
 TEST_F(Fp2ArithmeticTest, MultiplyByI) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, result, result_cpu;

  // Test: (a + b*i) * i = -b + a*i
@@ -509,7 +509,7 @@ TEST_F(Fp2ArithmeticTest, MultiplyByI) {

 // Test Frobenius map (on GPU)
 TEST_F(Fp2ArithmeticTest, Frobenius) {
-
+  uint64_t size_tracker = 0;
  Fp2 a, frob, conj, frob_cpu, conj_cpu;

  a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -541,7 +541,7 @@ TEST_F(Fp2ArithmeticTest, Frobenius) {

 // Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
 TEST_F(Fp2PropertyTest, AdditionAssociativity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -567,7 +567,7 @@ TEST_F(Fp2PropertyTest, AdditionAssociativity) {

 // Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -593,7 +593,7 @@ TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {

 // Test distributivity: a * (b + c) = a*b + a*c (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -620,7 +620,7 @@ TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {

 // Test addition commutativity (on GPU)
 TEST_F(Fp2PropertyTest, AdditionCommutativity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -638,7 +638,7 @@ TEST_F(Fp2PropertyTest, AdditionCommutativity) {

 // Test multiplication commutativity (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    Fp2 b = random_value();
@@ -656,7 +656,7 @@ TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {

 // Test additive identity: a + 0 = a (on GPU)
 TEST_F(Fp2PropertyTest, AdditiveIdentity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 result;
@@ -672,7 +672,7 @@ TEST_F(Fp2PropertyTest, AdditiveIdentity) {

 // Test multiplicative identity: a * 1 = a (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 result;
@@ -688,7 +688,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {

 // Test additive inverse: a + (-a) = 0 (on GPU)
 TEST_F(Fp2PropertyTest, AdditiveInverse) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 100; i++) {
    Fp2 a = random_value();
    Fp2 neg_a, result;
@@ -705,7 +705,7 @@ TEST_F(Fp2PropertyTest, AdditiveInverse) {

 // Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
 TEST_F(Fp2PropertyTest, MultiplicativeInverse) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();
    // Skip zero
@@ -726,7 +726,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeInverse) {

 // Test square vs multiply by self: a^2 = a * a (on GPU)
 TEST_F(Fp2PropertyTest, SquareVsMultiply) {
-
+  uint64_t size_tracker = 0;
  for (int i = 0; i < 50; i++) {
    Fp2 a = random_value();

@@ -747,7 +747,7 @@ TEST_F(Fp2PropertyTest, SquareVsMultiply) {

 // Test CUDA kernel: array addition
 TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {
-
+  uint64_t size_tracker = 0;
  const int n = 1000;
  Fp2 *h_a = new Fp2[n];
  Fp2 *h_b = new Fp2[n];
@@ -784,7 +784,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {

 // Test CUDA kernel: array multiplication
 TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {
-
+  uint64_t size_tracker = 0;
  const int n = 1000;
  Fp2 *h_a = new Fp2[n];
  Fp2 *h_b = new Fp2[n];
@@ -825,7 +825,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {

 // Test is_on_curve_g2 with point at infinity
 TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {
-
+  uint64_t size_tracker = 0;
  G2Affine point;
  g2_point_at_infinity(point);

@@ -835,7 +835,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {

 // Test is_on_curve_g2 with valid point construction
 TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {
-
+  uint64_t size_tracker = 0;
  G2Affine point;
  point.infinity = false;

@@ -860,7 +860,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {

 // Test that field operations maintain curve validity for G2
 TEST_F(Fp2ArithmeticTest, CurveG2FieldOperationsConsistency) {
-
+  uint64_t size_tracker = 0;
  // Create a point (we'll test the consistency check works)
  G2Affine point;
  point.infinity = false;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Testé	014b9d1b5c	WIP: run gpu documentation benchmarks on scaleway	2026-03-26 15:12:55 +01:00
David Testé	3518aa4ed2	chore(ci): add terraform script for gpu benchmarks This would spawn a H100-SXM-8-80G on Scaleway platform.	2026-03-25 15:28:42 +01:00