fix(gpu): refactor crypto params in backend

chore(deps): bump zizmorcore/zizmor-action from 0.5.2 to 0.5.3
Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.5.2 to 0.5.3.
- [Release notes](https://github.com/zizmorcore/zizmor-action/releases)
- [Commits](https://github.com/zizmorcore/zizmor-action/compare/71321a20a9...b1d7e1fb5d)

---
updated-dependencies:
- dependency-name: zizmorcore/zizmor-action
  dependency-version: 0.5.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-04-28 03:01:21 -04:00 · 2026-04-27 13:09:50 +02:00 · 2026-04-27 10:29:38 +02:00 · 2026-04-27 10:29:25 +02:00 · 2026-04-27 09:49:03 +02:00 · 2026-04-23 15:32:17 +02:00
378 changed files with 16566 additions and 10353 deletions
--- a/.github/workflows/aws_data_tests.yml
+++ b/.github/workflows/aws_data_tests.yml
@@ -54,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -16,7 +16,6 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
@@ -37,6 +36,7 @@ jobs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -79,6 +79,7 @@ jobs:
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+              - utils/tfhe-safe-serialize/**
            csprng:
              - tfhe-csprng/**
            zk_pok:
@@ -86,6 +87,8 @@ jobs:
            versionable:
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+            safe_serialize:
+              - utils/tfhe-safe-serialize/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -122,6 +125,7 @@ jobs:
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -145,7 +149,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -170,6 +174,11 @@ jobs:
        run: |
          make test_versionable

+      - name: Run tfhe-safe-serialize tests
+        if: needs.should-run.outputs.safe_serialize_test == 'true'
+        run: |
+          make test_safe_serialize
+
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -191,7 +200,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -204,7 +213,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -34,7 +34,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -14,12 +14,11 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]

 permissions:
  contents: read
@@ -32,16 +31,16 @@ jobs:
    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read  # Needed to check for file change
+      pull-requests: read # Needed to check for file change
    outputs:
      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
-          steps.changed-files.outputs.wasm_any_changed }}
+        steps.changed-files.outputs.wasm_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -63,6 +62,7 @@ jobs:
                - tfhe/js_on_wasm_tests/**
                - tfhe/web_wasm_parallel_tests/**
                - utils/tfhe-versionable/**
+                - utils/tfhe-safe-serialize/**
                - .github/workflows/aws_tfhe_wasm_tests.yml

  wasm-tests:
@@ -78,7 +78,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -92,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -105,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/backward_compat_pr_change_report.yml
+++ b/.github/workflows/backward_compat_pr_change_report.yml
@@ -6,6 +6,9 @@ name: backward_compat_pr_change_report
 on:
  pull_request:

+env:
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+
 permissions:
  contents: read

@@ -14,9 +17,35 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  should-run:
+    name: backward_compat_pr_change_report/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            backward:
+              - utils/tfhe-lints/snapshots/*.json
+
  change-report:
    name: backward_compat_pr_change_report/change-report (bpr)
    runs-on: ubuntu-latest
+    needs: should-run
+    if:
+      needs.should-run.outputs.backward_report == 'true'
    permissions:
      pull-requests: write # To send and modify message in the PR
    steps:
@@ -50,19 +79,11 @@ jobs:
            exit 1
          fi

-      - name: Find existing comment
+      - name: Post/refresh backward-compat report
        if: steps.report.outputs.has_report == 'true'
-        id: find-comment
-        uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4.0.0
+        uses: marocchino/sticky-pull-request-comment@0ea0beb66eb9baf113663a64ec522f60e49231c0
        with:
-          issue-number: ${{ github.event.pull_request.number }}
-          body-includes: '**Backward-compat snapshot:'
-
-      - name: Comment on PR
-        if: steps.report.outputs.has_report == 'true'
-        uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0
-        with:
-          comment-id: ${{ steps.find-comment.outputs.comment-id }}
-          issue-number: ${{ github.event.pull_request.number }}
-          body-path: report.md
-          edit-mode: replace
+          header: backward-compat-snapshot
+          hide_and_recreate: true
+          hide_classify: OUTDATED
+          path: report.md
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -19,7 +19,7 @@ on:
          - shortint_oprf
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
          - hlapi_kvstore
@@ -93,8 +93,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -223,7 +223,7 @@ jobs:
          results_type: ${{ inputs.additional_results_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -108,14 +108,14 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-hlapi-erc20:
-    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
+  run-benchmarks-hlapi-erc7984:
+    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
-      command: hlapi_erc20
-      additional_file_to_parse: erc20_pbs_count.csv
+      command: hlapi_erc7984
+      additional_file_to_parse: erc7984_pbs_count.csv
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_ct_key_sizes
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -17,6 +17,10 @@ on:
        description: "Run GPU core-crypto benchmarks"
        type: boolean
        default: true
+      run-gpu-zk-benchmarks:
+        description: "Run GPU ZK benchmarks"
+        type: boolean
+        default: true
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
@@ -36,7 +40,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -91,7 +95,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc20
+      command: integer_multi_bit,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -110,7 +114,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -165,21 +169,42 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

+  run-benchmarks-gpu-zk-server:
+    name: benchmark_documentation/run-benchmarks-gpu-zk-server
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-zk-benchmarks
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: integer_zk
+      op_flavor: default
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
  generate-svgs-with-benchmarks-run:
    name: benchmark-documentation/generate-svgs-with-benchmarks-run
    if: ${{ always() &&
-      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
+      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
-      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
+      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
+      run-benchmarks-gpu-zk-server
    ]
    uses: ./.github/workflows/generate_svgs.yml
    with:
      time_span_days: 5
      generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
-      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
+      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
      generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -188,7 +213,7 @@ jobs:

  generate-svgs-without-benchmarks-run:
    name: benchmark-documentation/generate-svgs-without-benchmarks-run
-    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
+    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    uses: ./.github/workflows/generate_svgs.yml
    with:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -37,7 +37,7 @@ on:
          - integer_zk_experimental
          - integer_aes
          - integer_aes256
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
      op_flavor:
@@ -123,8 +123,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -89,7 +89,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -173,7 +173,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -126,17 +126,11 @@ jobs:
    needs: prepare-matrix
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,25 +139,6 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

-      - name: Acknowledge remote instance failure
-        if: steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile != 'single-h100'
-        run: |
-          echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
-          echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
-          exit 1
-        env:
-          INPUTS_PROFILE: ${{ inputs.profile }}
-
-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' &&
-          steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile == 'single-h100'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  # Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
  install-dependencies:
    name: benchmark_gpu_common/install-dependencies
@@ -184,7 +159,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -296,7 +270,7 @@ jobs:
          filenames: ${{ inputs.additional_file_to_parse }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -333,13 +307,13 @@ jobs:

  teardown-instance:
    name: benchmark_gpu_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -42,7 +42,7 @@ env:
  OPTIMIZATION_TARGET: "throughput"
  BATCH_SIZE: "5000"
  SCHEDULING_POLICY: "MAX_PARALLELISM"
-  BENCHMARKS: "erc20"
+  BENCHMARKS: "erc7984"
  BRANCH_NAME: ${{ github.ref_name }}
  COMMIT_SHA: ${{ github.sha }}
  SLAB_SECRET: ${{ secrets.JOB_SECRET }}
@@ -94,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: |
            ~/.cargo/registry
@@ -214,14 +214,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -232,7 +232,7 @@ jobs:
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Use Node.js
-        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: 20.x

@@ -248,13 +248,13 @@ jobs:
          npm install && npm run deploy:emptyProxies && npx hardhat compile
        working-directory: fhevm/

-      - name: Profile erc20 no-cmux benchmark on GPU
+      - name: Profile erc7984 no-cmux benchmark on GPU
        run: |
          BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
          FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
          BENCHMARK_TYPE="THROUGHPUT_200" \
          OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
-          make -e "profile_erc20_gpu"
+          make -e "profile_erc7984_gpu"
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Get nsys profile name
@@ -271,7 +271,7 @@ jobs:
      - name: Upload profile artifact
        env:
          REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ env.REPORT_NAME }}
          path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
@@ -302,7 +302,7 @@ jobs:
        working-directory: fhevm/

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${COMMIT_SHA}_${BENCHMARKS}_${{ needs.parse-inputs.outputs.profile }}
          path: fhevm/$${{ env.RESULTS_FILENAME }}
@@ -333,7 +333,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -14,7 +14,7 @@ on:
          - integer
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -185,7 +185,7 @@ jobs:
          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -280,7 +280,7 @@ jobs:
          BENCH_TYPE: ${{ env.__TFHE_RS_BENCH_TYPE }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_regression_${{ env.RESULTS_FILE_SHA }} # RESULT_FILE_SHA is needed to avoid collision between matrix.command runs
          path: ${{ env.RESULTS_FILENAME }}
@@ -387,7 +387,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_ntt
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -77,7 +77,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -137,7 +137,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -180,7 +180,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -218,7 +218,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -138,7 +138,7 @@ jobs:
      - name: Node cache restoration
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -151,7 +151,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -43,14 +43,14 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2
+        uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -74,7 +74,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -88,7 +88,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -87,7 +87,7 @@ jobs:

      - name: Upload tables
        if: inputs.backend_comparison == false
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
          # This will upload all the file generated
@@ -111,7 +111,7 @@ jobs:

      - name: Upload comparison tables
        if: inputs.backend_comparison == true
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_backends_comparison_tables
          # This will upload all the file generated
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -209,60 +209,98 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

+  gpu-zk-server-latency-table:
+    name: generate_documentation_svgs/gpu-zk-server-latency-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: latency
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-latency
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  gpu-zk-server-throughput-table:
+    name: generate_documentation_svgs/gpu-zk-server-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: throughput
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-throughput
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
  # -----------------------------------------------------------
-  # ERC20 benchmarks tables
+  # ERC7984 benchmarks tables
  # -----------------------------------------------------------

-  cpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
+  cpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
      backend: cpu
      hardware_name: hpc7a.96xlarge
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
+      output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  gpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
+  gpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-gpu-svgs
    with:
      backend: gpu
      hardware_name: n3-H100-SXM5x8
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: multi_bit
      grouping_factor: 4
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
+      output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  hpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
+  hpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-hpu-svgs
    with:
      backend: hpu
      hardware_name: hpu_x1
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
+      output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -62,29 +63,24 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,13 +89,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -132,7 +121,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_core_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -77,7 +77,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,17 +25,11 @@ jobs:
    name: gpu_full_h100_tests/setup-instance
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,13 +38,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: gpu_full_h100_tests/cuda-tests-linux
    needs: [ setup-instance ]
@@ -74,7 +61,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -118,13 +104,13 @@ jobs:

  teardown-instance:
    name: gpu_full_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +186,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -65,27 +66,23 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -133,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -184,14 +173,14 @@ jobs:

  teardown-instance:
    name: gpu_hlapi_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -17,8 +17,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Nightly tests will be triggered each evening 8p.m.
-    - cron: "0 20 * * *"
+    # Weekly tests will be triggered every Monday at 8 p.m. UTC.
+    - cron: "0 20 * * 1"
  pull_request:


@@ -28,17 +28,48 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  should-run:
+    name: gpu_integer_long_run_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - '.github/workflows/gpu_integer_long_run_tests.yml'
+
  setup-instance:
    name: gpu_integer_long_run_tests/setup-instance
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    needs: [should-run]
+    # File-change detection only gates pull requests; scheduled runs stay restricted to the upstream repository.
+    if: github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && needs.should-run.outputs.is_needed_in_gpu_ci == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +143,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -131,6 +131,10 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

+      - name: Run semgrep and lint checks on CUDA code
+        run: |
+          make semgrep_and_lint_gpu_code
+
      - name: Run fmt checks
        run: |
          make check_fmt_gpu
@@ -139,10 +143,6 @@ jobs:
        run: |
          make pcc_gpu

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          make semgrep_and_lint_gpu_code
-
      - name: Run semver checks on tfhe-cuda-backend
        run: |
          make semver_check_cuda_backend
@@ -176,7 +176,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,30 +64,25 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_signed_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +177,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,30 +64,25 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_unsigned_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +177,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -55,12 +55,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/zk/**
              - tfhe-zk-pok/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

@@ -76,7 +73,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +167,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -62,7 +62,7 @@ jobs:
          PACKAGE: ${{ inputs.package-name }}
        run: |
          cargo package -p "${PACKAGE}"
-      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate
@@ -107,7 +107,7 @@ jobs:
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
--- a/.github/workflows/make_release_common_cuda.yml
+++ b/.github/workflows/make_release_common_cuda.yml
@@ -1,12 +1,36 @@
-name: make_release_cuda
+# Common workflow to make crate release for CUDA backend
+name: make_release_common_cuda

 on:
-  workflow_dispatch:
+  workflow_call:
    inputs:
-      dry_run:
-        description: "Dry-run"
+      package-name:
+        type: string
+        required: true
+      dry-run:
        type: boolean
        default: true
+    secrets:
+      REPO_CHECKOUT_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true
+      ALLOWED_TEAM:
+        required: true
+      READ_ORG_TOKEN:
+        required: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -21,15 +45,15 @@ permissions: {}

 jobs:
  verify-triggering-actor:
-    name: make_release_cuda/verify-triggering-actor
+    name: make_release_common_cuda/verify-triggering-actor
    if: startsWith(github.ref, 'refs/tags/')
    uses: ./.github/workflows/verify_triggering_actor.yml
    secrets:
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  setup-instance:
-    name: make_release_cuda/setup-instance
+    name: make_release_common_cuda/setup-instance
    needs: verify-triggering-actor
    runs-on: ubuntu-latest
    outputs:
@@ -37,7 +61,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,7 +71,7 @@ jobs:
          profile: gpu-build

  package:
-    name: make_release_cuda/package
+    name: make_release_common_cuda/package
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    outputs:
@@ -76,7 +100,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -89,7 +112,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -101,12 +123,14 @@ jobs:
          GCC_VERSION: ${{ matrix.gcc }}

      - name: Prepare package
+        env:
+          PACKAGE: ${{ inputs.package-name }}
        run: |
-          cargo package -p tfhe-cuda-backend
+          cargo package -p "${PACKAGE}"

-      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate

      - name: generate hash
@@ -114,8 +138,8 @@ jobs:
        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

  provenance:
-    name: make_release_cuda/provenance
-    if: ${{ !inputs.dry_run  }}
+    name: make_release_common_cuda/provenance
+    if: ${{ !inputs.dry-run  }}
    needs: [package]
    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
@@ -128,7 +152,7 @@ jobs:
      base64-subjects: ${{ needs.package.outputs.hash }}

  publish-cuda-release:
-    name: make_release_cuda/publish-cuda-release
+    name: make_release_common_cuda/publish-cuda-release
    needs: [setup-instance, package] # for comparing hashes
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    permissions:
@@ -150,7 +174,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -163,7 +186,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -174,25 +196,33 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
      - name: Download artifact
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
        env:
          CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+          PACKAGE: ${{ inputs.package-name }}
+          DRY_RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
+          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
          # would fail. This is safe since DRY_RUN is handled in the env section above.
          # shellcheck disable=SC2086
-          cargo publish -p tfhe-cuda-backend ${DRY_RUN}
+          cargo publish -p "${PACKAGE}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -204,7 +234,7 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -212,17 +242,17 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: make_release_cuda/teardown-instance
+    name: make_release_common_cuda/teardown-instance
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, publish-cuda-release]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -232,7 +262,7 @@ jobs:

      - name: Slack Notification
        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -16,6 +16,10 @@ on:
        description: "Push web js package"
        type: boolean
        default: true
+      push_web_compat_package:
+        description: "Push web compat (cross-origin) js package"
+        type: boolean
+        default: true
      push_node_package:
        description: "Push node js package"
        type: boolean
@@ -85,7 +89,7 @@ jobs:
          make build_web_js_api_parallel

      - name: Authenticate on NPM
-        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'
@@ -99,6 +103,23 @@ jobs:
          tag: ${{ env.NPM_TAG }}
          provenance: true

+      - name: Build web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        run: |
+          rm -rf tfhe/pkg
+
+          make build_web_js_api
+          sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
+
+      - name: Publish web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
+        with:
+          package: tfhe/pkg/package.json
+          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}
+          provenance: true
+
      - name: Build Node package
        if: ${{ inputs.push_node_package }}
        run: |
--- a/.github/workflows/make_release_tfhe_cuda.yml
+++ b/.github/workflows/make_release_tfhe_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of tfhe-rs CUDA backend on crates.io.
+name: make_release_tfhe_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "tfhe-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/make_release_tfhe_safe_serialize.yml
+++ b/.github/workflows/make_release_tfhe_safe_serialize.yml
@@ -0,0 +1,32 @@
+name: make_release_tfhe_safe_serialize
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_safe_serialize/make-release
+    uses: ./.github/workflows/make_release_common.yml
+    with:
+      package-name: "tfhe-safe-serialize"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
--- a/.github/workflows/make_release_zk_cuda.yml
+++ b/.github/workflows/make_release_zk_cuda.yml
@@ -0,0 +1,37 @@
+# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
+name: make_release_zk_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_zk_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "zk-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Restore Sagemath image from cache
        id: docker-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
      - name: Store Sagemath image in cache
        if: steps.docker-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,7 @@ members = [
    "utils/tfhe-backward-compat-checker",
    "utils/tfhe-backward-compat-data",
    "utils/tfhe-backward-compat-data/crates/add_new_version",
+    "utils/tfhe-safe-serialize",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/wasm-par-mq",
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/61
+++ b/61
@@ -312,7 +312,7 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
 	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
 		| grep -v '/cmake-build-debug/' \
 		| grep -v '/build/' \
-		| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
+		| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
 	venv/bin/python3 "scripts/check_scratch_cleanup.py"

 .PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
@@ -360,7 +360,7 @@ check_fmt_toml: install_taplo

 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
-	@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"
+	@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" ":!*.hpu" | typos --file-list - && echo "No typos found"

 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
@@ -557,6 +557,11 @@ clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-versionable -- --no-deps -D warnings

+.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
+clippy_safe_serialize: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-safe-serialize -- --no-deps -D warnings
+
 .PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
 clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -582,6 +587,17 @@ clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selec
 		echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
 	fi

+.PHONY: check_backward_compat_locks_did_not_change # Check backward compat Cargo.lock files are up to date (fails on the first stale lock)
+check_backward_compat_locks_did_not_change: install_rs_check_toolchain
+	@for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
+		echo "checking Cargo.lock for $$crate"; \
+		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
+			-C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate metadata --locked --format-version 1 > /dev/null || \
+		{ echo "Cargo.lock for $$crate is out of date. Update it with:"; \
+		  echo "  cd $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate && cargo metadata --format-version 1 > /dev/null"; \
+		  echo "then commit the updated Cargo.lock."; exit 1; }; \
+	done
+
 .PHONY: clippy_test_vectors # Run clippy lints on the test vectors app
 clippy_test_vectors: install_rs_check_toolchain
 	cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -592,7 +608,7 @@ clippy_test_vectors: install_rs_check_toolchain
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
+clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
 clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq

 .PHONY: clippy_fast # Run main clippy targets
@@ -1270,6 +1286,11 @@ test_versionable:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		--all-targets -p tfhe-versionable

+.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
+test_safe_serialize:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--all-targets -p tfhe-safe-serialize
+
 # The backward compat data folder holds historical binary data but also rust code to generate and load them.
 .PHONY: gen_backward_compat_data # Re-generate backward compatibility data
 gen_backward_compat_data:
@@ -1924,25 +1945,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
 	--bench hlapi \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
-bench_hlapi_erc20: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
+bench_hlapi_erc7984: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
-bench_hlapi_erc20_gpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
+bench_hlapi_erc7984_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
-bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
+bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
@@ -1966,13 +1987,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
-bench_hlapi_erc20_hpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ERC7984 operations on HPU
+bench_hlapi_erc7984_hpu: install_rs_check_toolchain
 	source ./setup_hpu.sh --config $(HPU_CONFIG); \
 	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
@@ -2028,10 +2049,10 @@ bench_summary: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'

 	# DEX
@@ -2073,10 +2094,10 @@ bench_summary_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'

 	# DEX
@@ -2255,6 +2276,7 @@ pcc_batch_5:
 	$(call run_recipe_with_details,clippy_tfhe_lints)
 	$(call run_recipe_with_details,check_compile_tests)
 	$(call run_recipe_with_details,clippy_backward_compat_data)
+	$(call run_recipe_with_details,check_backward_compat_locks_did_not_change)

 .PHONY: pcc_batch_6  # duration: 6'32''
 pcc_batch_6:
@@ -2266,6 +2288,7 @@ pcc_batch_6:
 	$(call run_recipe_with_details,clippy_zk_pok_wasm)
 	$(call run_recipe_with_details,clippy_trivium)
 	$(call run_recipe_with_details,clippy_versionable)
+	$(call run_recipe_with_details,clippy_safe_serialize)
 	$(call run_recipe_with_details,clippy_param_dedup)
 	$(call run_recipe_with_details,docs)

--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.14.0"
+version = "0.15.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -1,5 +1,14 @@
 use std::path::PathBuf;
-use std::process::Command;
+
+/// Returns the unquoted `NAME` field of `/etc/os-release`, or `None` if it cannot be read or found.
+fn get_linux_distribution_name() -> Option<String> {
+    let os_release = std::fs::read_to_string("/etc/os-release").ok()?;
+    os_release
+        .lines()
+        .find_map(|entry| entry.strip_prefix("NAME="))
+        // os-release(5) allows values to be wrapped in double or single quotes.
+        .map(|name| name.trim_matches(|c| c == '"' || c == '\'').to_string())
+}

 fn main() {
    if let Ok(val) = std::env::var("DOCS_RS") {
@@ -28,9 +37,7 @@ fn main() {
    println!("cargo::rerun-if-changed=src");

    if std::env::consts::OS == "linux" {
-        let output = Command::new("./get_os_name.sh").output().unwrap();
-        let distribution = String::from_utf8(output.stdout).unwrap();
-        if distribution != "Ubuntu\n" {
+        if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
            println!(
                "cargo:warning=This Linux distribution is not officially supported. \
                Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
@@ -4,22 +4,18 @@

 extern "C" {
 uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism);

 uint64_t scratch_cuda_integer_aes_ctr_256_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism);

 void cuda_integer_aes_ctr_encrypt_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
@@ -34,12 +30,10 @@ void cleanup_cuda_integer_aes_ctr_256_encrypt_64(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_key_expansion_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_key_expansion_64_async(CudaStreamsFFI streams,
                                         CudaRadixCiphertextFFI *expanded_keys,
@@ -57,12 +51,10 @@ void cuda_integer_aes_ctr_256_encrypt_64_async(
    int8_t *mem_ptr, void *const *bsks, void *const *ksks);

 uint64_t scratch_cuda_integer_key_expansion_256_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_key_expansion_256_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *expanded_keys,
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -36,5 +36,19 @@ void cuda_glwe_sample_extract_128_async(
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
    uint32_t glwe_dimension, uint32_t polynomial_size);
+
+void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
+                                            void *lwe_array_out,
+                                            void *lwe_array_in, uint32_t size,
+                                            uint32_t log_modulus,
+                                            uint32_t degree,
+                                            uint32_t grouping_factor);
+
+void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
+                                             void *lwe_array_out,
+                                             void *lwe_array_in, uint32_t size,
+                                             uint32_t log_modulus,
+                                             uint32_t degree,
+                                             uint32_t grouping_factor);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -17,10 +17,9 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_decompress,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t num_blocks_to_decompress,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_compress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -105,22 +105,32 @@ typedef struct {
  uint32_t polynomial_size;
 } CudaPackedGlweCiphertextListFFI;

+// FFI-boundary parameter struct for a LWE bootstrap key.
+// All fields are plain uint32_t for safe Rust/C++ interop.
+// Use crypto_params() (defined below) to obtain the strongly-typed C++ form.
+typedef struct {
+  uint32_t input_lwe_dimension;
+  uint32_t glwe_dimension;
+  uint32_t polynomial_size;
+  uint32_t base_log;
+  uint32_t level_count;
+  uint32_t big_lwe_dimension;
+  uint32_t pbs_type;
+  uint32_t grouping_factor;
+} CudaLweBootstrapKeyParamsFFI;
+
 uint64_t scratch_cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t input_lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, uint64_t lut_degree,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 uint64_t scratch_cuda_apply_many_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_many_lut, uint64_t lut_degree,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 void cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
@@ -139,12 +149,10 @@ void cuda_apply_many_univariate_lut_64_async(
    uint32_t lut_stride);

 uint64_t scratch_cuda_full_propagation_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_full_propagation_64_inplace_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
@@ -162,11 +170,9 @@ void cuda_integer_mult_inplace_64_async(
 uint64_t scratch_cuda_integer_mult_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
    bool const is_boolean_right, uint32_t message_modulus,
-    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
-    uint32_t num_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t carry_modulus, CudaLweBootstrapKeyParamsFFI bsk_params,
+    uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cleanup_cuda_integer_mult_inplace_64(CudaStreamsFFI streams,
                                          int8_t **mem_ptr_void);
@@ -183,12 +189,10 @@ void cuda_scalar_addition_ciphertext_64_inplace(
    uint32_t message_modulus, uint32_t carry_modulus);

 uint64_t scratch_cuda_logical_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_logical_scalar_shift_64_inplace_async(
@@ -196,12 +200,10 @@ void cuda_logical_scalar_shift_64_inplace_async(
    int8_t *mem_ptr, void *const *bsks, void *const *ksks);

 uint64_t scratch_cuda_arithmetic_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_arithmetic_scalar_shift_64_inplace_async(
@@ -215,12 +217,10 @@ void cleanup_cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_shift_and_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_shift_and_rotate_64_inplace_async(
@@ -232,22 +232,18 @@ void cleanup_cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, COMPARISON_TYPE op_type,
    bool is_signed, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 uint64_t scratch_cuda_integer_scalar_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, COMPARISON_TYPE op_type,
    bool is_signed, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -276,12 +272,10 @@ void cuda_boolean_bitop_inplace_64_async(
    void *const *bsks, void *const *ksks);

 uint64_t scratch_cuda_boolean_bitop_inplace_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -289,11 +283,9 @@ void cleanup_cuda_boolean_bitop_inplace_64(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

 uint64_t scratch_cuda_boolean_bitnot_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -311,6 +303,20 @@ void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
                               uint32_t param_message_modulus,
                               uint32_t param_carry_modulus);

+uint64_t scratch_cuda_integer_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+
+uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+
 void cuda_integer_bitop_inplace_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
@@ -322,38 +328,20 @@ void cuda_integer_scalar_bitop_inplace_64_async(
    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks);

-uint64_t scratch_cuda_integer_bitop_inplace_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-
 void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

-uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-
 void cleanup_cuda_integer_scalar_bitop_inplace_64(CudaStreamsFFI streams,
                                                  int8_t **mem_ptr_void);

-uint64_t scratch_cuda_cmux_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+uint64_t scratch_cuda_cmux_64_async(CudaStreamsFFI streams, int8_t **mem_ptr,
+                                    CudaLweBootstrapKeyParamsFFI bsk_params,
+                                    uint32_t ks_level, uint32_t ks_base_log,
+                                    uint32_t lwe_ciphertext_count,
+                                    uint32_t message_modulus,
+                                    uint32_t carry_modulus,
+                                    bool allocate_gpu_memory,
+                                    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_cmux_64_async(CudaStreamsFFI streams,
                        CudaRadixCiphertextFFI *lwe_array_out,
@@ -365,12 +353,10 @@ void cuda_cmux_64_async(CudaStreamsFFI streams,
 void cleanup_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_scalar_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_scalar_rotate_64_inplace_async(CudaStreamsFFI streams,
@@ -382,21 +368,17 @@ void cleanup_cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

 uint64_t scratch_cuda_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_propagate_single_carry_64_inplace_async(
@@ -418,12 +400,10 @@ void cleanup_cuda_add_and_propagate_single_carry_64_inplace(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_overflowing_sub_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t compute_overflow, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_overflowing_sub_64_inplace_async(
@@ -438,14 +418,12 @@ void cleanup_cuda_integer_overflowing_sub_64_inplace(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_in_radix,
    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t carry_modulus, bool reduce_degrees_for_single_carry_propagation,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_partial_sum_ciphertexts_vec_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
@@ -456,12 +434,11 @@ void cleanup_cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_scalar_mul_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_scalar_bits, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_scalar_mul_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
@@ -474,11 +451,9 @@ void cleanup_cuda_integer_scalar_mul_64(CudaStreamsFFI streams,

 uint64_t scratch_cuda_integer_div_rem_64_async(
    CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_div_rem_64_async(CudaStreamsFFI streams,
@@ -497,11 +472,9 @@ void cuda_integer_reverse_blocks_64_inplace_async(

 uint64_t scratch_cuda_integer_abs_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_abs_inplace_64_async(CudaStreamsFFI streams,
@@ -513,12 +486,10 @@ void cleanup_cuda_integer_abs_inplace_64(CudaStreamsFFI streams,
                                         int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_are_all_comparisons_block_true_64_async(
@@ -530,12 +501,10 @@ void cleanup_cuda_integer_are_all_comparisons_block_true_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_is_at_least_one_comparisons_block_true_64_async(
@@ -559,13 +528,11 @@ void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
                              CudaStreamsFFI streams);

 uint64_t scratch_cuda_apply_noise_squashing_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t num_original_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t input_glwe_dimension,
+    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_radix_blocks, uint32_t num_original_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_apply_noise_squashing_async(
@@ -577,12 +544,10 @@ void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void);

 uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_sub_and_propagate_single_carry_64_inplace_async(
@@ -595,13 +560,11 @@ void cleanup_cuda_sub_and_propagate_single_carry_64_inplace(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_unsigned_scalar_div_radix_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
@@ -612,13 +575,11 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_signed_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_signed_scalar_div_radix_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
@@ -629,12 +590,10 @@ void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_unsigned_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -651,12 +610,10 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_signed_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -672,12 +629,11 @@ void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_count_of_consecutive_bits_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t counter_num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    Direction direction, BitValue bit_value, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t counter_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, Direction direction,
+    BitValue bit_value, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_count_of_consecutive_bits_64_async(
@@ -689,13 +645,12 @@ void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_grouped_oprf_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_process,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_to_process,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t total_random_bits,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_grouped_oprf_64_async(CudaStreamsFFI streams,
                                        CudaRadixCiphertextFFI *radix_lwe_out,
@@ -707,31 +662,28 @@ void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
                                          int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_grouped_oprf_custom_range_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_intermediate,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t num_input_random_bits, uint32_t num_scalar_bits,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_intermediate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t num_input_random_bits,
+    uint32_t num_scalar_bits, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_grouped_oprf_custom_range_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    uint32_t num_blocks_intermediate, const void *seeded_lwe_input,
    const uint64_t *decomposed_scalar, const uint64_t *has_at_least_one_set,
    uint32_t num_scalars, uint32_t shift, int8_t *mem, void *const *bsks,
-    void *const *ksks);
+    void *const *compute_bsks, void *const *ksks);

 void cleanup_cuda_integer_grouped_oprf_custom_range_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_ilog2_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t input_num_blocks, uint32_t counter_num_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t input_num_blocks, uint32_t counter_num_blocks,
    uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -747,14 +699,12 @@ void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
                                   int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_match_value_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_matches, uint32_t num_input_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_match_value_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_result,
@@ -767,13 +717,11 @@ void cleanup_cuda_unchecked_match_value_64(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

 uint64_t scratch_cuda_cast_to_unsigned_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_input_blocks, uint32_t target_num_blocks, bool input_is_signed,
-    bool requires_full_propagate, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    bool input_is_signed, bool requires_full_propagate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_cast_to_unsigned_64_async(CudaStreamsFFI streams,
@@ -787,14 +735,12 @@ void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_match_value_or_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_matches, uint32_t num_input_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
    uint32_t max_output_is_zero, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_match_value_or_64_async(
@@ -808,12 +754,10 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_contains_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
@@ -828,12 +772,10 @@ void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_contains_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_clear_64_async(
@@ -846,12 +788,10 @@ void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_clears, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_is_in_clears_64_async(CudaStreamsFFI streams,
@@ -866,12 +806,10 @@ void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_clears, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_in_clears_64_async(
@@ -885,12 +823,10 @@ void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_unique, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_in_clears_64_async(
@@ -908,12 +844,10 @@ void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_of_clear_64_async(
@@ -927,12 +861,10 @@ void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
                                                    int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_first_index_of_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_of_64_async(
@@ -946,12 +878,10 @@ void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_index_of_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
@@ -967,12 +897,10 @@ void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_of_clear_64_async(
@@ -987,12 +915,10 @@ void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_all_eq_slices_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_all_eq_slices_64_async(
@@ -1005,12 +931,10 @@ void cleanup_cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
                                             int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_contains_sub_slice_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_lhs, uint32_t num_rhs, uint32_t num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_lhs, uint32_t num_rhs,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_sub_slice_64_async(
@@ -1023,12 +947,10 @@ void cleanup_cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
                                                  int8_t **mem_ptr_void);

 uint64_t scratch_cuda_cast_to_signed_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_input_blocks,
-    uint32_t target_num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool input_is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool input_is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_cast_to_signed_64_async(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -345,6 +345,29 @@ struct int_radix_params {
        message_modulus(message_modulus), carry_modulus(carry_modulus),
        noise_reduction_type(noise_reduction_type){};

+  // Builds the radix parameters from a bootstrap-key FFI bundle: the PBS
+  // type, GLWE/LWE dimensions, PBS decomposition (level_count/base_log) and
+  // grouping factor are read from bsk_params, while the keyswitch
+  // decomposition, moduli and noise reduction type are passed through.
+  int_radix_params(CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+                   uint32_t ks_base_log, uint32_t message_modulus,
+                   uint32_t carry_modulus,
+                   PBS_MS_REDUCTION_T noise_reduction_type)
+      : pbs_type(static_cast<PBS_TYPE>(bsk_params.pbs_type)),
+        glwe_dimension(bsk_params.glwe_dimension),
+        polynomial_size(bsk_params.polynomial_size),
+        big_lwe_dimension(bsk_params.big_lwe_dimension),
+        // The "small" LWE dimension is the bootstrap key's input dimension.
+        small_lwe_dimension(bsk_params.input_lwe_dimension),
+        ks_level(ks_level),
+        ks_base_log(ks_base_log),
+        pbs_level(bsk_params.level_count),
+        pbs_base_log(bsk_params.base_log),
+        grouping_factor(bsk_params.grouping_factor),
+        message_modulus(message_modulus),
+        carry_modulus(carry_modulus),
+        noise_reduction_type(noise_reduction_type) {}
+
  int_radix_params() = default;

  void print() {
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
@@ -5,12 +5,11 @@

 extern "C" {
 uint64_t scratch_cuda_kreyvium_generate_keystream_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_inputs);

 void cuda_kreyvium_generate_keystream_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -39,6 +39,28 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
                                                      uint32_t gpu_index,
                                                      int8_t **pbs_buffer);

+// Noise-tests-namespaced wrappers for scratch/cleanup, so that callers
+// working with the noise-tests PBS variant use a consistent naming scheme.
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
+
+// Noise tests variant: 64-bit torus, polynomial_size=2048 only. Uses the
+// NOISE_TESTS keybundle mode for noise analysis purposes.
+void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride);
+
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
@@ -56,6 +78,23 @@ void cuda_multi_bit_programmable_bootstrap_128_async(
 void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer);
+
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
+
+void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lwe_array_in, void const *lwe_input_indexes,
+    void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride);
 }

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
@@ -5,12 +5,11 @@

 extern "C" {
 uint64_t scratch_cuda_trivium_generate_keystream_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_inputs);

 void cuda_trivium_generate_keystream_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
  uint32_t num_lwes;
  uint32_t num_compact_lists;

-  int_radix_lut<Torus> *message_and_carry_extract_luts;
-  int_radix_lut<Torus> *identity_lut;
+  int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
+  int_radix_lut<Torus> *identity_lut = nullptr;

-  Torus *tmp_expanded_lwes;
-  Torus *tmp_ksed_small_to_big_expanded_lwes;
+  Torus *tmp_expanded_lwes = nullptr;
+  Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;

  bool gpu_memory_allocated;

@@ -148,66 +148,6 @@ template <typename Torus> struct zk_expand_mem {
      PANIC("GPU backend requires carry_modulus equal to message_modulus")
    }

-    // We create the identity LUT only if we are doing a SANITY_CHECK
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut =
-          new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
-                                   allocate_gpu_memory, size_tracker);
-
-      auto identity_lut_f = [](Torus x) -> Torus { return x; };
-
-      identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
-                                               LUT_0_FOR_ALL_BLOCKS);
-    }
-
-    auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
-      return x % casting_params.message_modulus;
-    };
-    auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
-      return (x / casting_params.carry_modulus) %
-             casting_params.message_modulus;
-    };
-
-    // Booleans have to be sanitized
-    auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
-    auto message_extract_and_sanitize_bool_lut_f =
-        [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-      return sanitize_bool_f(message_extract_lut_f(x));
-    };
-    auto carry_extract_and_sanitize_bool_lut_f =
-        [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-      return sanitize_bool_f(carry_extract_lut_f(x));
-    };
-
-    /** In case the casting key casts from BIG to SMALL key we run a single KS
-    to expand using the casting key as ksk. Otherwise, in case the casting key
-    casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
-    the casting key as ksk, then we keyswitch from BIG to SMALL using the
-    computing ksk, and lastly we apply the PBS. The output is always on the
-    BIG key.
-    **/
-    auto params = casting_params;
-    if (casting_key_type == SMALL_TO_BIG) {
-      params = computing_params;
-    }
-    message_and_carry_extract_luts = new int_radix_lut<Torus>(
-        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
-
-    // We are always packing two LWEs. We just need to be sure we have enough
-    // space in the carry part to store a message of the same size as is in the
-    // message part.
-    if (params.carry_modulus < params.message_modulus)
-      PANIC("Carry modulus must be at least as large as message modulus");
-    auto num_packed_msgs = 2;
-
-    // Adjust indexes to permute the output and access the correct LUT
-    auto h_indexes_in = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-    auto h_indexes_out = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-    auto h_lut_indexes = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-
    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
@@ -216,144 +156,202 @@ template <typename Torus> struct zk_expand_mem {
    h_expand_jobs = static_cast<expand_job<Torus> *>(
        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));

-    /*
-     * Each LWE contains encrypted data in both carry and message spaces
-     * that needs to be extracted.
-     *
-     * The loop processes each compact list (k) and for each LWE within that
-     * list:
-     * 1. Sets input indexes to read each LWE twice (for carry and message
-     * extraction)
-     * 2. Creates output indexes to properly reorder the results
-     * 3. Selects appropriate LUT index based on whether boolean sanitization is
-     * needed
-     *
-     * We want the output to have always first the content of the message part
-     * and then the content of the carry part of each LWE.
-     *
-     * i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
-     * carry_extract(LWE_1), ...
-     *
-     * Aiming that behavior, with 4 LWEs we would have:
-     *
-     * // Each LWE is processed twice
-     * h_indexes_in   = {0, 1, 2, 3, 0, 1, 2, 3}
-     *
-     * // First 4 use message LUT, last 4 use carry LUT
-     * h_lut_indexes  = {0, 0, 0, 0, 1, 1, 1, 1}
-     *
-     * // Reorders output so message and carry for each LWE appear together
-     * h_indexes_out  = {0, 2, 4, 6, 1, 3, 5, 7}
-     *
-     * If an LWE contains a boolean value, its LUT index is shifted by
-     * num_packed_msgs to use the sanitization LUT (which ensures output is
-     * exactly 0 or 1).
-     */
-    auto offset = 0;
-    for (int k = 0; k < num_compact_lists; k++) {
-      auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
-      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
-        auto lwe_index = i + num_packed_msgs * offset;
-        auto lwe_index_in_list = i % num_lwes_in_kth;
-        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
-                       "Cuda error: index %d is beyond the max value %d",
-                       lwe_index, num_packed_msgs * num_lwes);
-        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
-        h_indexes_out[lwe_index] =
-            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %lu is beyond the max value %lu",
-                       (unsigned long)h_indexes_in[lwe_index],
-                       (unsigned long)(num_packed_msgs * num_lwes));
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %lu is beyond the max value %lu",
-                       (unsigned long)h_indexes_out[lwe_index],
-                       (unsigned long)(num_packed_msgs * num_lwes));
-        // is_boolean_array tells us which input is a boolean and thus the
-        // related output needs boolean sanitization. It naturally has
-        // total_blocks entries, but h_indexes_out reaches
-        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
-        // the ceiling causes out-of-bounds access. Reading garbage "true" would
-        // set h_lut_indexes to an invalid index pointing to uninitialized
-        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
-        // to match.
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
-                       "Cuda error: index %lu for is_boolean_array is out of "
-                       "bounds (len is %lu)",
-                       (unsigned long)h_indexes_out[lwe_index],
-                       (unsigned long)is_boolean_array_len);
+    // NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
+    // no intermediate buffers needed.
+    if (expand_kind != EXPAND_KIND::NO_CASTING) {
+      /** In case the casting key casts from BIG to SMALL key we run a single KS
+      to expand using the casting key as ksk. Otherwise, in case the casting key
+      casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
+      the casting key as ksk, then we keyswitch from BIG to SMALL using the
+      computing ksk, and lastly we apply the PBS. The output is always on the
+      BIG key.
+      **/
+      auto params = casting_params;
+      if (casting_key_type == SMALL_TO_BIG) {
+        params = computing_params;
      }
-      offset += num_lwes_in_kth;
-    }

-    message_and_carry_extract_luts->set_lwe_indexes(
-        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
+      // We always pack two LWEs (message and carry parts per LWE)
+      auto num_packed_msgs = 2;

-    auto active_streams =
-        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
+      // Adjust indexes to permute the output and access the correct LUT.
+      //
+      // The loop below fills h_indexes_in and h_indexes_out so that the output
+      // is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
+      // msg_extract(LWE_1), carry_extract(LWE_1), ...
+      //
+      // With 4 LWEs the arrays look like:
+      //   h_indexes_in  = {0, 1, 2, 3, 0, 1, 2, 3}  (each LWE read twice)
+      //   h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}  (msg LUT then carry LUT)
+      //   h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}  (interleaved output)
+      //
+      // If an LWE contains a boolean its LUT index is shifted by
+      // num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
+      auto h_indexes_in = static_cast<Torus *>(
+          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+      auto h_indexes_out = static_cast<Torus *>(
+          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));

-    // Index generator for message/carry extraction LUTs
-    auto index_gen = [num_compact_lists,
-                      num_lwes_per_compact_list =
-                          this->num_lwes_per_compact_list,
-                      num_packed_msgs, is_boolean_array,
-                      h_indexes_out](Torus *h_lut_indexes, uint32_t) {
      auto offset = 0;
      for (int k = 0; k < num_compact_lists; k++) {
-        auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+        auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
        for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
          auto lwe_index = i + num_packed_msgs * offset;
-          auto boolean_offset =
-              is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
-          h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+          auto lwe_index_in_list = i % num_lwes_in_kth;
+          PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                         "Cuda error: index %d is beyond the max value %d",
+                         lwe_index, num_packed_msgs * num_lwes);
+          h_indexes_in[lwe_index] = lwe_index_in_list + offset;
+          h_indexes_out[lwe_index] =
+              num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
+          PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                         "Cuda error: index %lu is beyond the max value %lu",
+                         (unsigned long)h_indexes_in[lwe_index],
+                         (unsigned long)(num_packed_msgs * num_lwes));
+          PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                         "Cuda error: index %lu is beyond the max value %lu",
+                         (unsigned long)h_indexes_out[lwe_index],
+                         (unsigned long)(num_packed_msgs * num_lwes));
+          // is_boolean_array tells us which input is a boolean and thus the
+          // related output needs boolean sanitization. It naturally has
+          // total_blocks entries, but h_indexes_out reaches
+          // num_packed_msgs * ceil(total_blocks/2) - 1. When total_blocks is
+          // odd, the ceiling causes out-of-bounds access. Reading garbage
+          // "true" would set h_lut_indexes to an invalid index pointing to
+          // uninitialized memory instead of a real LUT. Rust pads
+          // is_boolean_array with FALSE to match.
+          PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                         "Cuda error: index %lu for is_boolean_array is out of "
+                         "bounds (len is %lu)",
+                         (unsigned long)h_indexes_out[lwe_index],
+                         (unsigned long)is_boolean_array_len);
        }
        offset += num_lwes_in_kth;
      }
-    };

-    message_and_carry_extract_luts->generate_and_broadcast_lut(
-        active_streams, {0, 1, 2, 3},
-        {message_extract_lut_f, carry_extract_lut_f,
-         message_extract_and_sanitize_bool_lut_f,
-         carry_extract_and_sanitize_bool_lut_f},
-        index_gen, true, {}, h_lut_indexes);
+      auto active_streams =
+          streams.active_gpu_subset(2 * num_lwes, params.pbs_type);

-    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
-        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
-    // The expanded LWEs will always be on the casting key format
-    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
-        streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
+      // SANITY_CHECK uses identity_lut (skipping the full message/carry
+      // extraction LUT and the SMALL_TO_BIG intermediate buffer).
+      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        identity_lut =
+            new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
+                                     allocate_gpu_memory, size_tracker);

-    tmp_ksed_small_to_big_expanded_lwes =
-        (Torus *)cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<Torus>(num_lwes,
-                                   casting_params.big_lwe_dimension + 1),
-            streams.stream(0), streams.gpu_index(0), size_tracker,
-            allocate_gpu_memory);
+        auto identity_lut_f = [](Torus x) -> Torus { return x; };
+        identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
+                                                 LUT_0_FOR_ALL_BLOCKS);
+        identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
+                                      h_indexes_in, h_indexes_out);
+        identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
+            active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
+      } else {
+        // We are always packing two LWEs. We just need to be sure we have
+        // enough space in the carry part to store a message of the same size
+        // as is in the message part.
+        if (params.carry_modulus < params.message_modulus)
+          PANIC("Carry modulus must be at least as large as message modulus");
+
+        message_and_carry_extract_luts =
+            new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
+                                     allocate_gpu_memory, size_tracker);
+        message_and_carry_extract_luts->set_lwe_indexes(
+            streams.stream(0), streams.gpu_index(0), h_indexes_in,
+            h_indexes_out);
+
+        auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
+          return x % casting_params.message_modulus;
+        };
+        auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
+          return (x / casting_params.carry_modulus) %
+                 casting_params.message_modulus;
+        };
+        auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
+        auto message_extract_and_sanitize_bool_lut_f =
+            [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+          return sanitize_bool_f(message_extract_lut_f(x));
+        };
+        auto carry_extract_and_sanitize_bool_lut_f =
+            [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+          return sanitize_bool_f(carry_extract_lut_f(x));
+        };
+
+        auto h_lut_indexes = static_cast<Torus *>(
+            malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+
+        auto index_gen = [num_compact_lists,
+                          num_lwes_per_compact_list =
+                              this->num_lwes_per_compact_list,
+                          num_packed_msgs, is_boolean_array,
+                          h_indexes_out](Torus *h_lut_indexes, uint32_t) {
+          auto offset = 0;
+          for (int k = 0; k < num_compact_lists; k++) {
+            auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+            for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
+              auto lwe_index = i + num_packed_msgs * offset;
+              auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
+                                        ? num_packed_msgs
+                                        : 0;
+              h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+            }
+            offset += num_lwes_in_kth;
+          }
+        };
+
+        message_and_carry_extract_luts->generate_and_broadcast_lut(
+            active_streams, {0, 1, 2, 3},
+            {message_extract_lut_f, carry_extract_lut_f,
+             message_extract_and_sanitize_bool_lut_f,
+             carry_extract_and_sanitize_bool_lut_f},
+            index_gen, true, {}, h_lut_indexes);
+        message_and_carry_extract_luts
+            ->allocate_lwe_vector_for_non_trivial_indexes(
+                active_streams, 2 * num_lwes, size_tracker,
+                allocate_gpu_memory);
+        free(h_lut_indexes);
+
+        // SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
+        // on the full casting path.
+        tmp_ksed_small_to_big_expanded_lwes =
+            (Torus *)cuda_malloc_with_size_tracking_async(
+                safe_mul_sizeof<Torus>(num_lwes,
+                                       casting_params.big_lwe_dimension + 1),
+                streams.stream(0), streams.gpu_index(0), size_tracker,
+                allocate_gpu_memory);
+      }
+
+      // The expanded LWEs will always be on the casting key format
+      tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
+          safe_mul_sizeof<Torus>(num_lwes,
+                                 casting_params.big_lwe_dimension + 1),
+          streams.stream(0), streams.gpu_index(0), size_tracker,
+          allocate_gpu_memory);
+
+      free(h_indexes_in);
+      free(h_indexes_out);
+    }

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-    free(h_indexes_in);
-    free(h_indexes_out);
-    free(h_lut_indexes);
  }

  void release(CudaStreams streams) {
-    message_and_carry_extract_luts->release(streams);
-    delete message_and_carry_extract_luts;
-
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut->release(streams);
-      delete identity_lut;
+    if (expand_kind != EXPAND_KIND::NO_CASTING) {
+      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        identity_lut->release(streams);
+        delete identity_lut;
+      } else {
+        message_and_carry_extract_luts->release(streams);
+        delete message_and_carry_extract_luts;
+        cuda_drop_with_size_tracking_async(
+            tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
+            streams.gpu_index(0), gpu_memory_allocated);
+      }
+      cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
+                                         streams.gpu_index(0),
+                                         gpu_memory_allocated);
    }

-    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
-                                       streams.gpu_index(0),
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
-                                       streams.stream(0), streams.gpu_index(0),
-                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
@@ -2,19 +2,14 @@
 #include "aes.cuh"

 uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_aes_encrypt<uint64_t>(
      CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
@@ -22,19 +17,14 @@ uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
 }

 uint64_t scratch_cuda_integer_aes_ctr_256_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_aes_encrypt<uint64_t>(
      CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
@@ -78,18 +68,13 @@ void cleanup_cuda_integer_aes_ctr_256_encrypt_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_key_expansion_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_key_expansion<uint64_t>(
      CudaStreams(streams), (int_key_expansion_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
@@ -390,7 +390,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_a[6], &wires_a[15], &input_bits[7]);
  XOR(&wires_a[10], &wires_a[15], &wires_b[0]);
  XOR(&wires_a[11], &wires_a[20], &wires_a[9]);
-  FLUSH(&wires_a[6], &wires_a[10]);
+  FLUSH(&wires_a[6], &wires_a[10], &wires_a[11]);
  XOR(&wires_a[7], &input_bits[7], &wires_a[11]);
  FLUSH(&wires_a[7]);
  XOR(&wires_a[17], &wires_a[10], &wires_a[11]);
@@ -426,7 +426,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[22], &wires_b[18], &wires_a[19]);
  XOR(&wires_b[23], &wires_b[19], &wires_a[21]);
  XOR(&wires_b[24], &wires_b[20], &wires_a[18]);
-  FLUSH(&wires_b[21], &wires_b[23], &wires_b[24]);
+  FLUSH(&wires_b[21], &wires_b[22], &wires_b[23], &wires_b[24]);
  XOR(&wires_b[25], &wires_b[21], &wires_b[22]);
  FLUSH(&wires_b[25]);

@@ -468,7 +468,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,

  XOR(&wires_b[37], &wires_b[36], &wires_b[34]);
  XOR(&wires_b[38], &wires_b[27], &wires_b[36]);
-  FLUSH(&wires_b[38]);
+  FLUSH(&wires_b[38], &wires_b[37]);
  XOR(&wires_b[44], &wires_b[33], &wires_b[37]);

  CudaRadixCiphertextFFI *and_outs_6[] = {&wires_b[39]};
@@ -479,7 +479,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[40], &wires_b[25], &wires_b[39]);
  XOR(&wires_b[41], &wires_b[40], &wires_b[37]);
  XOR(&wires_b[43], &wires_b[29], &wires_b[40]);
-  FLUSH(&wires_b[41]);
+  FLUSH(&wires_b[41], &wires_b[40], &wires_b[43], &wires_b[44]);
  XOR(&wires_b[45], &wires_b[42], &wires_b[41]);
  FLUSH(&wires_b[45]);

@@ -514,6 +514,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[57], &wires_b[50], &wires_b[53]);
  XOR(&wires_b[58], &wires_c[4], &wires_b[46]);
  XOR(&wires_b[59], &wires_c[3], &wires_b[54]);
+  FLUSH(&wires_b[57], &wires_b[58]);
  XOR(&wires_b[60], &wires_b[46], &wires_b[57]);
  XOR(&wires_b[61], &wires_c[14], &wires_b[57]);
  XOR(&wires_b[62], &wires_b[52], &wires_b[58]);
@@ -589,6 +590,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
 #undef FLUSH
 #undef AND
 #undef ADD_ONE_FLUSH
+#undef ADD_ONE
 }

 /**
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes256.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes256.cu
@@ -14,18 +14,13 @@ void cuda_integer_aes_ctr_256_encrypt_64_async(
 }

 uint64_t scratch_cuda_integer_key_expansion_256_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_key_expansion_256<uint64_t>(
      CudaStreams(streams), (int_key_expansion_256_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -150,3 +150,31 @@ void cuda_glwe_sample_extract_128_async(
          "N's are powers of two in the interval [256..4096].")
  }
 }
+
+void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
+                                            void *lwe_array_out,
+                                            void *lwe_array_in, uint32_t size,
+                                            uint32_t log_modulus,
+                                            uint32_t degree,
+                                            uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
+
+void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
+                                             void *lwe_array_out,
+                                             void *lwe_array_in, uint32_t size,
+                                             uint32_t log_modulus,
+                                             uint32_t degree,
+                                             uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<__uint128_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<__uint128_t *>(lwe_array_out),
+      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -463,5 +463,48 @@ __global__ void __launch_bounds__(512)
      return;
  }
 }
+// This kernel is only used by the noise tests. It follows the same logic
+// that is embedded in the keybundle; a standalone __global__ function is
+// needed so that this logic can be tested in isolation.
+template <typename Torus, class params>
+__global__ void
+modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
+                         uint32_t log_modulus, uint32_t grouping_factor) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    int num_monomials = 1 << grouping_factor;
+    int input_offset = tid * grouping_factor;
+    int output_offset = tid * num_monomials;
+    // We calculate all monomials even if the first one is never used.
+    for (int ggsw_idx = 0; ggsw_idx < num_monomials; ggsw_idx++) {
+      array_out[ggsw_idx + output_offset] =
+          calculates_monomial_degree<Torus, params>(&array_in[input_offset],
+                                                    ggsw_idx, grouping_factor);
+    }
+  }
+}
+// This is meant to be launched only from the noise tests,
+// which is why only a specific set of parameters is supported.
+template <typename Torus>
+__host__ void host_modulus_switch_multi_bit(
+    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
+    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  int multibit_size = size / grouping_factor;
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
+  switch (degree) {
+  case 2048:
+    modulus_switch_multi_bit<Torus, Degree<2048>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [2048].")
+  };
+
+  check_cuda_error(cudaGetLastError());
+}

 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -2,17 +2,12 @@

 uint64_t scratch_cuda_integer_abs_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_abs<uint64_t>(
      CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -11,19 +11,14 @@ void cuda_boolean_bitop_inplace_64_async(
 }

 uint64_t scratch_cuda_boolean_bitop_inplace_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_boolean_bitop<uint64_t>(
      CudaStreams(streams), (boolean_bitop_buffer<uint64_t> **)mem_ptr,
@@ -41,18 +36,13 @@ void cleanup_cuda_boolean_bitop_inplace_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_boolean_bitnot_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_boolean_bitnot<uint64_t>(
      CudaStreams(streams), (boolean_bitnot_buffer<uint64_t> **)mem_ptr, params,
@@ -78,6 +68,34 @@ void cleanup_cuda_boolean_bitnot_64(CudaStreamsFFI streams,
  *mem_ptr_void = nullptr;
 }

+uint64_t scratch_cuda_integer_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);
+
+  return scratch_cuda_bitop<uint64_t>(
+      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
+      lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
+}
+
+uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);
+
+  return scratch_cuda_bitop<uint64_t>(
+      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
+      lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
+}
+
 void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
                               CudaRadixCiphertextFFI *radix_ciphertext,
                               uint32_t ct_message_modulus,
@@ -99,25 +117,6 @@ void cuda_integer_bitop_inplace_64_async(
                       (uint64_t **)(ksks));
 }

-uint64_t scratch_cuda_integer_bitop_inplace_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
-
-  return scratch_cuda_bitop<uint64_t>(
-      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
-      lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
-}
-
 void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void) {

@@ -128,25 +127,6 @@ void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
  *mem_ptr_void = nullptr;
 }

-uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
-
-  return scratch_cuda_bitop<uint64_t>(
-      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
-      lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
-}
-
 void cleanup_cuda_integer_scalar_bitop_inplace_64(CudaStreamsFFI streams,
                                                  int8_t **mem_ptr_void) {

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
@@ -34,19 +34,14 @@ void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
 }

 uint64_t scratch_cuda_cast_to_unsigned_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_input_blocks, uint32_t target_num_blocks, bool input_is_signed,
-    bool requires_full_propagate, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    bool input_is_signed, bool requires_full_propagate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_cast_to_unsigned<uint64_t>(
      CudaStreams(streams), (int_cast_to_unsigned_buffer<uint64_t> **)mem_ptr,
@@ -80,19 +75,13 @@ void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_cast_to_signed_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_input_blocks,
-    uint32_t target_num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool input_is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool input_is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_cast_to_signed<uint64_t>(
      CudaStreams(streams), (int_cast_to_signed_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,18 +1,16 @@
 #include "integer/cmux.cuh"

-uint64_t scratch_cuda_cmux_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+uint64_t scratch_cuda_cmux_64_async(CudaStreamsFFI streams, int8_t **mem_ptr,
+                                    CudaLweBootstrapKeyParamsFFI bsk_params,
+                                    uint32_t ks_level, uint32_t ks_base_log,
+                                    uint32_t lwe_ciphertext_count,
+                                    uint32_t message_modulus,
+                                    uint32_t carry_modulus,
+                                    bool allocate_gpu_memory,
+                                    PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch cmux")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,18 +1,14 @@
 #include "integer/comparison.cuh"

 uint64_t scratch_cuda_integer_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, COMPARISON_TYPE op_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch comparison")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  uint64_t size_tracker = 0;
  switch (op_type) {
@@ -38,18 +34,14 @@ uint64_t scratch_cuda_integer_comparison_64_async(
 }

 uint64_t scratch_cuda_integer_scalar_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, COMPARISON_TYPE op_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch scalar comparison")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  uint64_t size_tracker = 0;
  switch (op_type) {
@@ -151,18 +143,13 @@ void cleanup_cuda_integer_scalar_comparison_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_comparison_check<uint64_t>(
      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
@@ -196,18 +183,13 @@ void cleanup_cuda_integer_are_all_comparisons_block_true_64(
 }

 uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_comparison_check<uint64_t>(
      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -23,22 +23,24 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_decompress,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t num_blocks_to_decompress,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  // Decompression doesn't keyswitch, so big and small dimensions are the same
  int_radix_params encryption_params(
-      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log,
-      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
+      (PBS_TYPE)bsk_params.pbs_type, encryption_glwe_dimension,
+      encryption_polynomial_size, bsk_params.big_lwe_dimension,
+      bsk_params.big_lwe_dimension, 0, 0, bsk_params.level_count,
+      bsk_params.base_log, bsk_params.grouping_factor, message_modulus,
+      carry_modulus, noise_reduction_type);

  int_radix_params compression_params(
-      pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
-      0, 0, pbs_level, pbs_base_log, grouping_factor, message_modulus,
-      carry_modulus, noise_reduction_type);
+      (PBS_TYPE)bsk_params.pbs_type, compression_glwe_dimension,
+      compression_polynomial_size, bsk_params.big_lwe_dimension,
+      compression_glwe_dimension * compression_polynomial_size, 0, 0,
+      bsk_params.level_count, bsk_params.base_log, bsk_params.grouping_factor,
+      message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
      CudaStreams(streams), (int_decompression<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -2,17 +2,13 @@

 uint64_t scratch_cuda_integer_div_rem_64_async(
    CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch div")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_div_rem<uint64_t>(
      CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
@@ -1,19 +1,14 @@
 #include "ilog2.cuh"

 uint64_t scratch_cuda_integer_count_of_consecutive_bits_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t counter_num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    Direction direction, BitValue bit_value, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t counter_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, Direction direction,
+    BitValue bit_value, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_count_of_consecutive_bits<uint64_t>(
      CudaStreams(streams), params,
@@ -53,19 +48,14 @@ void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_ilog2_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t input_num_blocks, uint32_t counter_num_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t input_num_blocks, uint32_t counter_num_blocks,
    uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_ilog2<uint64_t>(
      CudaStreams(streams), params, (int_ilog2_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -15,17 +15,12 @@ void cuda_full_propagation_64_inplace_async(
 }

 uint64_t scratch_cuda_full_propagation_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_full_propagation<uint64_t>(
      CudaStreams(streams), (int_fullprop_buffer<uint64_t> **)mem_ptr, params,
@@ -44,17 +39,13 @@ void cleanup_cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
@@ -62,17 +53,13 @@ uint64_t scratch_cuda_propagate_single_carry_64_inplace_async(
 }

 uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
@@ -80,17 +67,13 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace_async(
 }

 uint64_t scratch_cuda_integer_overflowing_sub_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t compute_overflow, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_overflowing_sub<uint64_t>(
      CudaStreams(streams), (int_borrow_prop_memory<uint64_t> **)mem_ptr,
@@ -170,17 +153,12 @@ void cleanup_cuda_integer_overflowing_sub_64_inplace(CudaStreamsFFI streams,

 uint64_t scratch_cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_apply_univariate_lut<uint64_t>(
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
@@ -190,17 +168,12 @@ uint64_t scratch_cuda_apply_univariate_lut_64_async(

 uint64_t scratch_cuda_apply_many_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_many_lut, uint64_t lut_degree,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_apply_many_univariate_lut<uint64_t>(
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
@@ -294,19 +267,14 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
 }

 uint64_t scratch_cuda_apply_noise_squashing_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t original_num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t input_glwe_dimension,
+    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_radix_blocks, uint32_t original_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_apply_noise_squashing_mem(
      streams, params, (int_noise_squashing_lut<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -122,16 +122,12 @@ void cuda_integer_mult_inplace_64_async(
 uint64_t scratch_cuda_integer_mult_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
    bool const is_boolean_right, uint32_t message_modulus,
-    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          polynomial_size * glwe_dimension, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    uint32_t carry_modulus, CudaLweBootstrapKeyParamsFFI bsk_params,
+    uint32_t ks_base_log, uint32_t ks_level, uint32_t num_radix_blocks,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  const uint32_t polynomial_size = bsk_params.polynomial_size;
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  switch (polynomial_size) {
  case 256:
@@ -164,20 +160,14 @@ void cleanup_cuda_integer_mult_inplace_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_in_radix,
    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    uint32_t carry_modulus, bool reduce_degrees_for_single_carry_propagation,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);
  return scratch_cuda_integer_partial_sum_ciphertexts_vec<uint64_t>(
      CudaStreams(streams),
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cu
@@ -1,19 +1,14 @@
 #include "integer/oprf.cuh"

 uint64_t scratch_cuda_integer_grouped_oprf_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_process,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_to_process,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t total_random_bits,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_grouped_oprf<uint64_t>(
      CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
@@ -45,20 +40,14 @@ void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_grouped_oprf_custom_range_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_intermediate,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t num_input_random_bits, uint32_t num_scalar_bits,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_intermediate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t num_input_random_bits,
+    uint32_t num_scalar_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_grouped_oprf_custom_range<uint64_t>(
      CudaStreams(streams),
@@ -72,13 +61,13 @@ void cuda_integer_grouped_oprf_custom_range_64_async(
    uint32_t num_blocks_intermediate, const void *seeded_lwe_input,
    const uint64_t *decomposed_scalar, const uint64_t *has_at_least_one_set,
    uint32_t num_scalars, uint32_t shift, int8_t *mem, void *const *bsks,
-    void *const *ksks) {
+    void *const *compute_bsks, void *const *ksks) {

  host_integer_grouped_oprf_custom_range<uint64_t>(
      CudaStreams(streams), radix_lwe_out, num_blocks_intermediate,
      (const uint64_t *)seeded_lwe_input, decomposed_scalar,
      has_at_least_one_set, num_scalars, shift,
-      (int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks,
+      (int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks, compute_bsks,
      (uint64_t *const *)ksks);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -114,7 +114,7 @@ void host_integer_grouped_oprf_custom_range(
    const Torus *decomposed_scalar, const Torus *has_at_least_one_set,
    uint32_t num_scalars, uint32_t shift,
    int_grouped_oprf_custom_range_memory<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks) {
+    void *const *compute_bsks, Torus *const *ksks) {

  CudaRadixCiphertextFFI *computation_buffer = mem_ptr->tmp_oprf_output;
  set_zero_radix_ciphertext_slice_async<Torus>(
@@ -127,12 +127,12 @@ void host_integer_grouped_oprf_custom_range(

  host_integer_scalar_mul_radix<Torus>(
      streams, computation_buffer, decomposed_scalar, has_at_least_one_set,
-      mem_ptr->scalar_mul_buffer, bsks, ksks, mem_ptr->params.message_modulus,
-      num_scalars);
+      mem_ptr->scalar_mul_buffer, compute_bsks, ksks,
+      mem_ptr->params.message_modulus, num_scalars);

-  host_logical_scalar_shift_inplace<Torus>(streams, computation_buffer, shift,
-                                           mem_ptr->logical_scalar_shift_buffer,
-                                           bsks, ksks, num_blocks_intermediate);
+  host_logical_scalar_shift_inplace<Torus>(
+      streams, computation_buffer, shift, mem_ptr->logical_scalar_shift_buffer,
+      compute_bsks, ksks, num_blocks_intermediate);

  uint32_t num_blocks_output = radix_lwe_out->num_radix_blocks;
  uint32_t blocks_to_copy =
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
@@ -1,19 +1,13 @@
 #include "scalar_div.cuh"

 uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -45,19 +39,13 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_signed_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_signed_scalar_div_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -89,20 +77,14 @@ void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_unsigned_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -143,20 +125,14 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
 }

 uint64_t scratch_cuda_integer_signed_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), params,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -1,18 +1,13 @@
 #include "integer/scalar_mul.cuh"

 uint64_t scratch_cuda_integer_scalar_mul_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_scalar_bits, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_scalar_mul<uint64_t>(
      CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -1,18 +1,13 @@
 #include "scalar_rotate.cuh"

 uint64_t scratch_cuda_scalar_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_scalar_rotate<uint64_t>(
      CudaStreams(streams),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -1,18 +1,13 @@
 #include "scalar_shifts.cuh"

 uint64_t scratch_cuda_logical_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_logical_scalar_shift<uint64_t>(
      CudaStreams(streams),
@@ -35,18 +30,13 @@ void cuda_logical_scalar_shift_64_inplace_async(
 }

 uint64_t scratch_cuda_arithmetic_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_arithmetic_scalar_shift<uint64_t>(
      CudaStreams(streams),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
@@ -1,18 +1,13 @@
 #include "shift_and_rotate.cuh"

 uint64_t scratch_cuda_shift_and_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_shift_and_rotate<uint64_t>(
      CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cu
@@ -1,18 +1,13 @@
 #include "subtraction.cuh"

 uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
      CudaStreams(streams), (int_sub_and_propagate<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_comparison.cu
@@ -1,18 +1,13 @@
 #include "integer/vector_comparison.cuh"

 uint64_t scratch_cuda_unchecked_all_eq_slices_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_all_eq_slices<uint64_t>(
      CudaStreams(streams),
@@ -50,18 +45,13 @@ void cleanup_cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_contains_sub_slice_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_lhs, uint32_t num_rhs, uint32_t num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_lhs, uint32_t num_rhs,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_contains_sub_slice<uint64_t>(
      CudaStreams(streams),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu
@@ -1,19 +1,14 @@
 #include "integer/vector_find.cuh"

 uint64_t scratch_cuda_unchecked_match_value_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_matches, uint32_t num_input_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_match_value<uint64_t>(
      CudaStreams(streams), (int_unchecked_match_buffer<uint64_t> **)mem_ptr,
@@ -56,20 +51,15 @@ void cleanup_cuda_unchecked_match_value_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_match_value_or_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_matches, uint32_t num_input_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
    uint32_t max_output_is_zero, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_match_value_or<uint64_t>(
      CudaStreams(streams),
@@ -107,18 +97,13 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_contains_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_contains<uint64_t>(
      CudaStreams(streams), (int_unchecked_contains_buffer<uint64_t> **)mem_ptr,
@@ -157,18 +142,13 @@ void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_contains_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_contains_clear<uint64_t>(
      CudaStreams(streams),
@@ -202,18 +182,13 @@ void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_clears, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_is_in_clears<uint64_t>(
      CudaStreams(streams),
@@ -247,18 +222,13 @@ void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_clears, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_index_in_clears<uint64_t>(
      CudaStreams(streams),
@@ -299,18 +269,13 @@ void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_unique, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
      CudaStreams(streams),
@@ -351,18 +316,13 @@ void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
      CudaStreams(streams),
@@ -403,18 +363,13 @@ void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_first_index_of_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_first_index_of<uint64_t>(
      CudaStreams(streams),
@@ -455,18 +410,13 @@ void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_index_of_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_index_of<uint64_t>(
      CudaStreams(streams), (int_unchecked_index_of_buffer<uint64_t> **)mem_ptr,
@@ -508,18 +458,13 @@ void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_unchecked_index_of_clear<uint64_t>(
      CudaStreams(streams),
--- a/backends/tfhe-cuda-backend/cuda/src/kreyvium/kreyvium.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/kreyvium/kreyvium.cu
@@ -2,18 +2,14 @@
 #include "kreyvium.cuh"

 uint64_t scratch_cuda_kreyvium_generate_keystream_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_inputs) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_kreyvium_encrypt<uint64_t>(
      CudaStreams(streams), (int_kreyvium_buffer<uint64_t> **)mem_ptr, params,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -420,6 +420,39 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
  }
 }

+// Noise tests variant: identical to host_cg_multi_bit_programmable_bootstrap
+// but each keybundle is computed in NOISE_TESTS mode instead of GENERIC.
+template <typename Torus, class params>
+__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus const *lwe_output_indexes, Torus const *lut_vector,
+    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+  // The loop stride is the chunk size stored in the PBS buffer.
+  auto lwe_chunk_size = buffer->lwe_chunk_size;
+  // Process offsets in [0, lwe_dimension / grouping_factor), chunk by chunk.
+  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
+       lwe_offset += lwe_chunk_size) {
+
+    // Compute the keybundle at lwe_offset in NOISE_TESTS mode, not GENERIC.
+    execute_compute_keybundle_noise_tests<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+    // Then run the CG external-product loop with the same lwe_offset.
+    execute_cg_external_product_loop<Torus, params>(
+        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
+        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
+        lut_stride);
+  }
+}
+
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -645,6 +645,103 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
  *buffer = nullptr;
 }

+// Noise-tests-namespaced scratch wrapper: forwards every argument unchanged to
+// scratch_cuda_multi_bit_programmable_bootstrap_64_async and returns its
+// result, so that callers using the noise-tests PBS variant have a consistent
+// API.
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  const uint64_t scratch_result =
+      scratch_cuda_multi_bit_programmable_bootstrap_64_async(
+          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
+  return scratch_result;
+}
+
+// Noise-tests-namespaced cleanup wrapper: forwards stream, gpu_index and
+// pbs_buffer unchanged to cleanup_cuda_multi_bit_programmable_bootstrap_64.
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
+  // The noise-tests scratch delegates to the standard 64-bit scratch, so the
+  // standard cleanup is used to tear the buffer down.
+  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
+                                                   pbs_buffer);
+}
+
+// Noise tests variant of the 64-bit multi-bit PBS. The input is assumed to be
+// modulus switched before bootstrapping. The entry point panics unless
+// num_samples == 1, base_log <= 64 and polynomial_size == 2048, then
+// dispatches on the pbs_variant stored in the buffer (TBC, CG or DEFAULT) to
+// the matching *_noise_tests host function instantiated with Degree<2048>.
+void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride) {
+
+  // Parameter validation
+  PANIC_IF_FALSE(num_samples == 1,
+                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
+                 num_samples);
+
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
+                 base_log);
+  PANIC_IF_FALSE(polynomial_size == 2048,
+                 "Cuda error (multi-bit PBS noise tests): only polynomial "
+                 "size 2048 is supported, got %d.",
+                 polynomial_size);
+
+  // Typed views of the opaque arguments, built once and shared by every
+  // variant below
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  auto *typed_out = static_cast<uint64_t *>(lwe_array_out);
+  auto *typed_out_indexes = static_cast<const uint64_t *>(lwe_output_indexes);
+  auto *typed_lut = static_cast<const uint64_t *>(lut_vector);
+  auto *typed_lut_indexes = static_cast<const uint64_t *>(lut_vector_indexes);
+  auto *typed_in = static_cast<const uint64_t *>(lwe_array_in);
+  auto *typed_in_indexes = static_cast<const uint64_t *>(lwe_input_indexes);
+  auto *typed_bsk = static_cast<const uint64_t *>(bootstrapping_key);
+
+  auto *buffer = reinterpret_cast<pbs_buffer<uint64_t, MULTI_BIT> *>(mem_ptr);
+
+  switch (buffer->pbs_variant) {
+  case PBS_VARIANT::TBC:
+#if CUDA_ARCH >= 900
+    host_tbc_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
+                                                          Degree<2048>>(
+        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_lut_indexes, typed_in, typed_in_indexes, typed_bsk, buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
+        base_log, level_count, num_samples, num_many_lut, lut_stride);
+    break;
+#else
+    // TBC path is only compiled when CUDA_ARCH >= 900
+    PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
+#endif
+  case PBS_VARIANT::CG:
+    host_cg_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
+                                                         Degree<2048>>(
+        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_lut_indexes, typed_in, typed_in_indexes, typed_bsk, buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
+        base_log, level_count, num_samples, num_many_lut, lut_stride);
+    break;
+  case PBS_VARIANT::DEFAULT:
+    host_multi_bit_programmable_bootstrap_noise_tests<uint64_t, Degree<2048>>(
+        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_lut_indexes, typed_in, typed_in_indexes, typed_bsk, buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
+        base_log, level_count, num_samples, num_many_lut, lut_stride);
+    break;
+  default:
+    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
+  }
+}
+
 /**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -25,7 +25,8 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
         level_count;
 }

-template <typename Torus, class params, sharedMemDegree SMD>
+template <typename Torus, class params, sharedMemDegree SMD,
+          bool runs_noise_test = false>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -55,9 +56,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {

-    const Torus *block_lwe_array_in =
-        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
-
    double2 *keybundle = keybundle_array +
                         // select the input
                         input_idx * keybundle_size_per_input;
@@ -86,10 +84,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    // Precalculate the monomial degrees and store them in shared memory
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
    if (threadIdx.x < (1 << grouping_factor)) {
-      const Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, threadIdx.x, grouping_factor);
+      if constexpr (runs_noise_test == true) {
+        // For noise tests the input array contains the input lwe but also the
+        // modswitched results. This allows to avoid changing the accumulation
+        // kernel for the noise tests since the input body will stay in the same
+        // position. The layout of the input array is the following:
+        // | input lwe         | modswitched inputs                     |
+        // | lwe_dimension + 1 | (lwe_dimension / grouping_factor)      |
+        // |                   |   * 2^grouping_factor                  |
+
+        // This offset allows to jump directly to the modswitched inputs,
+        // skipping the input lwe
+        const Torus modswitched_offset = lwe_dimension + 1;
+
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] *
+                              (lwe_dimension / grouping_factor) *
+                              (1 << grouping_factor) +
+                          modswitched_offset];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
+        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
+
+      } else {
+        // In production we calculate the monomial degrees on the fly, since
+        // they are not stored in the input array.
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+        monomial_degrees[threadIdx.x] =
+            calculates_monomial_degree<Torus, params>(
+                lwe_array_group, threadIdx.x, grouping_factor);
+      }
    }
    __syncthreads();

@@ -145,7 +173,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 // Then we can just calculate the offset needed to apply this coefficients, and
 // the operation transforms into a pointwise vector multiplication, avoiding to
 // perform extra instructions other than MADD
-template <typename Torus, class params, sharedMemDegree SMD>
+template <typename Torus, class params, sharedMemDegree SMD,
+          bool runs_noise_test = false>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -219,10 +248,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;

    if (threadIdx.x < (1 << grouping_factor)) {
-      const Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, threadIdx.x, grouping_factor);
+      if constexpr (runs_noise_test == true) {
+        // For noise tests the input array contains the input lwe but also the
+        // modswitched results. This allows to avoid changing the accumulation
+        // kernel for the noise tests since the input body will stay in the same
+        // position. The layout of the input array is the following:
+        // | input lwe         | modswitched inputs                     |
+        // | lwe_dimension + 1 | (lwe_dimension / grouping_factor)      |
+        // |                   |   * 2^grouping_factor                  |
+
+        // This offset allows to jump directly to the modswitched inputs,
+        // skipping the input lwe
+        const Torus modswitched_offset = lwe_dimension + 1;
+
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] *
+                              (lwe_dimension / grouping_factor) *
+                              (1 << grouping_factor) +
+                          modswitched_offset];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
+        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
+
+      } else {
+        // In production we calculate the monomial degrees on the fly, since
+        // they are not stored in the input array.
+        const Torus *block_lwe_array_in =
+            &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
+
+        const Torus *lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+        monomial_degrees[threadIdx.x] =
+            calculates_monomial_degree<Torus, params>(
+                lwe_array_group, threadIdx.x, grouping_factor);
+      }
    }
    __syncthreads();

@@ -662,6 +721,7 @@ enum class MultiBitKeybundleLaunchMode {
  AUTO,
  GENERIC,
  SPECIALIZED_2_2,
+  NOISE_TESTS,
 };

 template <typename Torus, class params>
@@ -726,30 +786,65 @@ __host__ void execute_compute_keybundle_with_mode(
    bool use_specialized =
        launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
        (launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
+         can_use_specialized) ||
+        (launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS &&
         can_use_specialized);
+    bool use_noise_test_template =
+        launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS;
    if (use_specialized) {
      dim3 thds_new_keybundle(512, 1, 1);
-      check_cuda_error(cudaFuncSetAttribute(
-          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-              Torus, Degree<2048>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-              Torus, Degree<2048>, FULLSM>,
-          cudaFuncCachePreferShared));
-      check_cuda_error(cudaGetLastError());
-      device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
-          Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
-                                         3 * full_sm_keybundle, stream>>>(
-          lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-          lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
+      if (use_noise_test_template) {
+        // Set up the noise-test variant of the specialized 2_2 kernel
+        check_cuda_error(cudaFuncSetAttribute(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM, true>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            3 * full_sm_keybundle));
+        check_cuda_error(cudaFuncSetCacheConfig(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM, true>,
+            cudaFuncCachePreferShared));
+        check_cuda_error(cudaGetLastError());
+        device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+            Torus, Degree<2048>, FULLSM, true>
+            <<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
+               stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
+                         bootstrapping_key, lwe_dimension, lwe_offset,
+                         chunk_size, keybundle_size_per_input);
+      } else {
+        check_cuda_error(cudaFuncSetAttribute(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            3 * full_sm_keybundle));
+        check_cuda_error(cudaFuncSetCacheConfig(
+            device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+                Torus, Degree<2048>, FULLSM>,
+            cudaFuncCachePreferShared));
+        check_cuda_error(cudaGetLastError());
+        device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
+            Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
+                                           3 * full_sm_keybundle, stream>>>(
+            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+            lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
+      }
    } else {
-      device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
-          <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
-              lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
-              lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-              level_count, lwe_offset, chunk_size, keybundle_size_per_input,
-              d_mem, 0);
+      if (use_noise_test_template) {
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM,
+                                                          true>
+            <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+                lwe_array_in, lwe_input_indexes, keybundle_fft,
+                bootstrapping_key, lwe_dimension, glwe_dimension,
+                polynomial_size, grouping_factor, level_count, lwe_offset,
+                chunk_size, keybundle_size_per_input, d_mem, 0);
+      } else {
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
+            <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+                lwe_array_in, lwe_input_indexes, keybundle_fft,
+                bootstrapping_key, lwe_dimension, glwe_dimension,
+                polynomial_size, grouping_factor, level_count, lwe_offset,
+                chunk_size, keybundle_size_per_input, d_mem, 0);
+      }
    }
  }
  check_cuda_error(cudaGetLastError());
@@ -796,6 +891,20 @@ __host__ void execute_compute_keybundle_2_2_specialized(
      grouping_factor, level_count, lwe_offset,
      MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
 }
+// Used only to run noise tests: forwards all arguments to
+// execute_compute_keybundle_with_mode, selecting the NOISE_TESTS launch mode.
+template <typename Torus, class params>
+__host__ void execute_compute_keybundle_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
+  constexpr MultiBitKeybundleLaunchMode noise_tests_mode =
+      MultiBitKeybundleLaunchMode::NOISE_TESTS;
+
+  execute_compute_keybundle_with_mode<Torus, params>(
+      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+      grouping_factor, level_count, lwe_offset, noise_tests_mode);
+}

 template <typename Torus, class params, bool is_first_iter>
 __host__ void execute_step_one(
@@ -955,4 +1064,62 @@ __host__ void host_multi_bit_programmable_bootstrap(
    }
  }
 }
+
+// Noise tests variant of the multi-bit PBS host loop (DEFAULT variant).
+// The lwe_dimension / grouping_factor LWE groups are processed in chunks of
+// buffer->lwe_chunk_size: each chunk first gets its keybundle computed with
+// the NOISE_TESTS launch mode, then every group of the chunk goes through
+// execute_step_one followed by execute_step_two.
+template <typename Torus, class params>
+__host__ void host_multi_bit_programmable_bootstrap_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus const *lwe_output_indexes, Torus const *lut_vector,
+    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  const auto chunk_stride = buffer->lwe_chunk_size;
+  const uint32_t num_groups = lwe_dimension / grouping_factor;
+
+  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
+       lwe_offset += chunk_stride) {
+
+    // Keybundle of the current chunk, computed in NOISE_TESTS mode
+    execute_compute_keybundle_with_mode<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset,
+        MultiBitKeybundleLaunchMode::NOISE_TESTS);
+
+    // Accumulation: the last chunk may hold fewer groups than chunk_stride
+    const uint32_t groups_in_chunk =
+        std::min((uint32_t)chunk_stride, num_groups - lwe_offset);
+    for (uint32_t j = 0; j < groups_in_chunk; j++) {
+      // Position of this group among all groups, across chunks
+      const uint32_t global_iter = j + lwe_offset;
+
+      // Step one is specialized on whether this is the very first group
+      if (global_iter == 0) {
+        execute_step_one<Torus, params, true>(
+            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+            lwe_input_indexes, buffer, num_samples, lwe_dimension,
+            glwe_dimension, polynomial_size, base_log, level_count);
+      } else {
+        execute_step_one<Torus, params, false>(
+            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+            lwe_input_indexes, buffer, num_samples, lwe_dimension,
+            glwe_dimension, polynomial_size, base_log, level_count);
+      }
+
+      // Step two is specialized on whether this is the very last group
+      if (global_iter + 1 == num_groups) {
+        execute_step_two<Torus, params, true>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      } else {
+        execute_step_two<Torus, params, false>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      }
+    }
+  }
+}
 #endif // MULTIBIT_PBS_H
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -293,6 +293,81 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
  *buffer = nullptr;
 }

+// Noise-tests-namespaced scratch wrapper: forwards every argument unchanged to
+// scratch_cuda_multi_bit_programmable_bootstrap_128_async and returns its
+// result, so that callers using the noise-tests PBS128 variant have a
+// consistent API.
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  const uint64_t scratch_result =
+      scratch_cuda_multi_bit_programmable_bootstrap_128_async(
+          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
+  return scratch_result;
+}
+
+// Noise-tests-namespaced cleanup wrapper for the PBS128 variant: delegates to
+// cleanup_cuda_multi_bit_programmable_bootstrap_128 and then synchronizes the
+// stream.
+// NOTE(review): the 64-bit noise-tests cleanup wrapper does not synchronize
+// after delegating; confirm whether this extra synchronization is intended.
+void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
+  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
+                                                    pbs_buffer);
+
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  cuda_synchronize_stream(cuda_stream, gpu_index);
+}
+
+// Noise tests variant of the 128-bit multi-bit PBS. The input is assumed to
+// contain precomputed modswitched values in the extended input array layout.
+// The entry point panics unless num_samples == 1, base_log <= 64 and
+// polynomial_size == 2048, then dispatches on the pbs_variant stored in the
+// buffer (CG or DEFAULT) to the matching *_noise_tests_128 host function
+// instantiated with Degree<2048>. Any other variant panics.
+void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lwe_array_in, void const *lwe_input_indexes,
+    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  // Parameter validation
+  PANIC_IF_FALSE(num_samples == 1,
+                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
+                 num_samples);
+  PANIC_IF_FALSE(base_log <= 64,
+                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
+                 base_log);
+  PANIC_IF_FALSE(polynomial_size == 2048,
+                 "Cuda error (multi-bit PBS128 noise tests): only polynomial "
+                 "size 2048 is supported, got %d.",
+                 polynomial_size);
+
+  // Typed views of the opaque arguments, built once and shared by both
+  // variants below: outputs, LUT and key are 128-bit, inputs and indexes are
+  // 64-bit
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  auto *typed_out = static_cast<__uint128_t *>(lwe_array_out);
+  auto *typed_out_indexes = static_cast<const uint64_t *>(lwe_output_indexes);
+  auto *typed_lut = static_cast<const __uint128_t *>(lut_vector);
+  auto *typed_in = static_cast<const uint64_t *>(lwe_array_in);
+  auto *typed_in_indexes = static_cast<const uint64_t *>(lwe_input_indexes);
+  auto *typed_bsk = static_cast<const __uint128_t *>(bootstrapping_key);
+
+  auto *buffer =
+      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
+
+  switch (buffer->pbs_variant) {
+  case PBS_VARIANT::CG:
+    host_cg_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
+                                                             Degree<2048>>(
+        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_in, typed_in_indexes, typed_bsk, buffer, glwe_dimension,
+        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        num_samples, num_many_lut, lut_stride);
+    break;
+  case PBS_VARIANT::DEFAULT:
+    host_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
+                                                          Degree<2048>>(
+        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
+        typed_in, typed_in_indexes, typed_bsk, buffer, glwe_dimension,
+        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        num_samples, num_many_lut, lut_stride);
+    break;
+  default:
+    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
+  }
+}
+
 /**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -18,7 +18,8 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
                                      (size_t)2); // accumulator
 }

-template <typename InputTorus, class params, sharedMemDegree SMD>
+template <typename InputTorus, class params, sharedMemDegree SMD,
+          bool runs_noise_test = false>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
    const InputTorus *__restrict__ lwe_array_in,
    const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
@@ -80,11 +81,35 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
    // Precalculate the monomial degrees and store them in shared memory
    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
    if (threadIdx.x < (1 << grouping_factor)) {
-      auto lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      monomial_degrees[threadIdx.x] =
-          calculates_monomial_degree<InputTorus, params>(
-              lwe_array_group, threadIdx.x, grouping_factor);
+      if constexpr (runs_noise_test == true) {
+        // For noise tests the input array contains the input lwe but also the
+        // modswitched results. This allows to avoid changing the accumulation
+        // kernel for the noise tests since the input body will stay in the same
+        // position. The layout of the input array is the following:
+        // | input lwe         | modswitched inputs                     |
+        // | lwe_dimension + 1 | (lwe_dimension / grouping_factor)      |
+        // |                   |   * 2^grouping_factor                  |
+
+        // This offset allows to jump directly to the modswitched inputs,
+        // skipping the input lwe
+        const InputTorus modswitched_offset = lwe_dimension + 1;
+
+        const InputTorus *block_lwe_array_in_noise =
+            &lwe_array_in[lwe_input_indexes[input_idx] *
+                              (lwe_dimension / grouping_factor) *
+                              (1 << grouping_factor) +
+                          modswitched_offset];
+
+        const InputTorus *lwe_array_group =
+            block_lwe_array_in_noise +
+            rev_lwe_iteration * (1 << grouping_factor);
+        monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
+      } else {
+        auto lwe_array_group =
+            block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+        monomial_degrees[threadIdx.x] =
+            calculates_monomial_degree<InputTorus, params>(
+                lwe_array_group, threadIdx.x, grouping_factor);
+      }
    }
    __syncthreads();

@@ -588,6 +613,74 @@ __host__ void execute_compute_keybundle_128(
  check_cuda_error(cudaGetLastError());
 }

+// Used only to run noise tests: launches the keybundle kernel with the
+// runs_noise_test=true variant, which reads modswitched inputs from the
+// extended input array layout instead of computing them on-the-fly.
+//
+// The kernel is launched on `stream` after selecting device `gpu_index`, for
+// the chunk of LWE groups that starts at `lwe_offset`. The NOSM or FULLSM
+// instantiation is chosen depending on whether the device's maximum shared
+// memory can hold the full keybundle shared-memory buffer.
+template <typename InputTorus, class params>
+__host__ void execute_compute_keybundle_noise_tests_128(
+    cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
+    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
+    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
+  cuda_set_device(gpu_index);
+
+  // Number of LWE groups handled by this launch: a full chunk, or whatever
+  // remains after lwe_offset for the last chunk
+  auto lwe_chunk_size = buffer->lwe_chunk_size;
+  uint64_t chunk_size = std::min(
+      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
+
+  // Per-input keybundle size forwarded to the kernel; it is sized from the
+  // full lwe_chunk_size, not from the possibly smaller chunk_size
+  uint64_t keybundle_size_per_input =
+      lwe_chunk_size * level_count * (glwe_dimension + 1) *
+      (glwe_dimension + 1) * (polynomial_size / 2) * 4;
+
+  // Dynamic shared memory requested by the FULLSM kernel, compared against
+  // the maximum shared memory reported for this GPU
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
+          __uint128_t>(polynomial_size);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+
+  auto d_mem = buffer->d_mem_keybundle;
+  auto keybundle_fft = buffer->keybundle_fft;
+
+  // Grid: x = num_samples * chunk_size, y = (glwe_dimension + 1)^2,
+  // z = level_count. Block: polynomial_size / params::opt threads.
+  dim3 grid_keybundle(num_samples * chunk_size,
+                      (glwe_dimension + 1) * (glwe_dimension + 1), level_count);
+  dim3 thds(polynomial_size / params::opt, 1, 1);
+
+  if (max_shared_memory < full_sm_keybundle) {
+    // Not enough shared memory: NOSM instantiation, launched with 0 bytes of
+    // dynamic shared memory; d_mem and full_sm_keybundle are handed to the
+    // kernel instead (presumably as its scratch area and per-block size --
+    // confirm against the kernel)
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, NOSM, true>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, NOSM, true>,
+        cudaFuncCachePreferShared));
+    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
+                                                          NOSM, true>
+        <<<grid_keybundle, thds, 0, stream>>>(
+            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+            d_mem, full_sm_keybundle);
+  } else {
+    // Enough shared memory: FULLSM instantiation, launched with
+    // full_sm_keybundle bytes of dynamic shared memory; the last kernel
+    // argument is 0 in this case
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, FULLSM, true>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle_128<
+            InputTorus, params, FULLSM, true>,
+        cudaFuncCachePreferShared));
+    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
+                                                          FULLSM, true>
+        <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
+            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
+            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+            d_mem, 0);
+  }
+  // Single launch-error check covering whichever branch was taken
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename InputTorus, class params, bool is_first_iter>
 __host__ void execute_step_one_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
@@ -1200,4 +1293,96 @@ supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
  }
 }

+// Noise-test variant of host_cg_multi_bit_programmable_bootstrap_128: for each
+// lwe_chunk_size-wide chunk it builds the keybundle with the noise-test kernel
+// (runs_noise_test=true), then runs the unchanged CG external-product loop.
+template <typename InputTorus, class params>
+__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests_128(
+    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
+    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
+    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  auto lwe_chunk_size = buffer->lwe_chunk_size; // loop step; assumes > 0
+
+  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
+       lwe_offset += lwe_chunk_size) {
+
+    // Keybundle for the chunk starting at lwe_offset, built by the noise-test
+    // kernel (runs_noise_test=true) so it reads precomputed modswitched values.
+    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    execute_cg_external_product_loop_128<InputTorus, params>(
+        stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
+        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        lwe_offset, num_many_lut, lut_stride);
+  }
+}
+
+template <typename InputTorus, class params>
+__host__ void host_multi_bit_programmable_bootstrap_noise_tests_128(
+    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
+    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
+    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+
+  auto lwe_chunk_size = buffer->lwe_chunk_size; // loop step; assumes > 0
+
+  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
+       lwe_offset += lwe_chunk_size) {
+
+    // Keybundle for the chunk starting at lwe_offset, built by the noise-test
+    // kernel (runs_noise_test=true) so it reads precomputed modswitched values.
+    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    // Accumulate (same as standard path); the final chunk may be shorter
+    uint64_t chunk_size =
+        std::min((uint32_t)lwe_chunk_size,
+                 (lwe_dimension / grouping_factor) - lwe_offset);
+    for (uint32_t j = 0; j < chunk_size; j++) { // j: chunk-local index
+      bool is_first_iter = (j + lwe_offset) == 0; // global index is 0
+      bool is_last_iter =
+          (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
+      if (is_first_iter) {
+        execute_step_one_128<InputTorus, params, true>(
+            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+            base_log, level_count);
+      } else {
+        execute_step_one_128<InputTorus, params, false>(
+            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+            base_log, level_count);
+      }
+
+      if (is_last_iter) {
+        execute_step_two_128<InputTorus, params, true>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      } else {
+        execute_step_two_128<InputTorus, params, false>(
+            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+            num_samples, glwe_dimension, polynomial_size, level_count, j,
+            num_many_lut, lut_stride);
+      }
+    }
+  }
+}
+
 #endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -795,6 +795,40 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
      MultiBitTbcLaunchMode::SPECIALIZED_2_2);
 }

+// Noise-test variant of the TBC multi-bit PBS host loop: per lwe_chunk_size
+// chunk, a NOISE_TESTS-mode keybundle, then the standard AUTO TBC accumulate.
+template <typename Torus, class params>
+__host__ void host_tbc_multi_bit_programmable_bootstrap_noise_tests(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus const *lwe_output_indexes, Torus const *lut_vector,
+    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {
+  cuda_set_device(gpu_index);
+
+  auto lwe_chunk_size = buffer->lwe_chunk_size; // loop step; assumes > 0
+  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
+       lwe_offset += lwe_chunk_size) {
+
+    // Keybundle for the chunk at lwe_offset, computed in NOISE_TESTS mode
+    execute_compute_keybundle_noise_tests<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+
+    // Accumulate via the standard TBC loop, launch mode left to AUTO
+    execute_tbc_external_product_loop<Torus, params>(
+        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
+        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
+        lut_stride, MultiBitTbcLaunchMode::AUTO);
+  }
+}
+
 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory) {
--- a/backends/tfhe-cuda-backend/cuda/src/trivium/trivium.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/trivium/trivium.cu
@@ -2,18 +2,14 @@
 #include "trivium.cuh"

 uint64_t scratch_cuda_trivium_generate_keystream_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_inputs) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_trivium_encrypt<uint64_t>(
      CudaStreams(streams), (int_trivium_buffer<uint64_t> **)mem_ptr, params,
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -119,71 +119,73 @@ __host__ void host_expand_without_verification(
      streams.stream(0), streams.gpu_index(0), true);

  if (mem_ptr->expand_kind == EXPAND_KIND::NO_CASTING) {
+    // This path is added to mimic the CPU fallback behaviour for the no_casting
+    // expand, which is needed for the noise sanity checks.
    host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
                                   lwe_array_out, d_expand_jobs, num_lwes);
-    return;
-  }

-  host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
-                                 expanded_lwes, d_expand_jobs, num_lwes);
+  } else {
+    // This is our default path for the expand with casting if needed.
+    host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
+                                   expanded_lwes, d_expand_jobs, num_lwes);

-  auto lwe_array_input = expanded_lwes;
-  auto ksks = casting_keys;
-  auto message_and_carry_extract_luts = mem_ptr->message_and_carry_extract_luts;
+    auto lwe_array_input = expanded_lwes;
+    auto ksks = casting_keys;
+    auto message_and_carry_extract_luts =
+        mem_ptr->message_and_carry_extract_luts;

-  auto lut = mem_ptr->message_and_carry_extract_luts;
-  if (casting_key_type == SMALL_TO_BIG) {
-    if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
+    auto lut = mem_ptr->message_and_carry_extract_luts;
+    if (casting_key_type == SMALL_TO_BIG) {
+      if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
+      }
+      // Keyswitch from small to big key if needed
+      auto ksed_small_to_big_expanded_lwes =
+          mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
+      std::vector<Torus *> lwe_trivial_indexes_vec =
+          lut->lwe_trivial_indexes_vec;
+
+      auto casting_params = mem_ptr->casting_params;
+      auto casting_output_dimension = casting_params.big_lwe_dimension;
+      auto casting_input_dimension = casting_params.small_lwe_dimension;
+      auto casting_ks_level = casting_params.ks_level;
+      auto casting_ks_base_log = casting_params.ks_base_log;
+
+      // apply keyswitch to BIG
+      execute_keyswitch_async<Torus>(
+          streams.get_ith(0), ksed_small_to_big_expanded_lwes,
+          lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
+          casting_keys, casting_input_dimension, casting_output_dimension,
+          casting_ks_base_log, casting_ks_level, num_lwes,
+          lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
+
+      // In this case, the next keyswitch will use the compute ksk
+      ksks = compute_ksks;
+      lwe_array_input = ksed_small_to_big_expanded_lwes;
    }
-    // Keyswitch from small to big key if needed
-    auto ksed_small_to_big_expanded_lwes =
-        mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
-    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

-    auto casting_params = mem_ptr->casting_params;
-    auto casting_output_dimension = casting_params.big_lwe_dimension;
-    auto casting_input_dimension = casting_params.small_lwe_dimension;
-    auto casting_ks_level = casting_params.ks_level;
-    auto casting_ks_base_log = casting_params.ks_base_log;
+    // Apply LUT
+    cuda_memset_async(lwe_array_out, 0,
+                      safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
+                                             (size_t)num_lwes, (size_t)2),
+                      streams.stream(0), streams.gpu_index(0));
+    CudaRadixCiphertextFFI output;
+    into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
+    CudaRadixCiphertextFFI input;
+    into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
+    // This is a special case only for our noise sanity checks
+    // If we are doing a SANITY_CHECK expand, we just apply the identity LUT
+    // This replicates the CPU fallback behaviour of the casting expand
+    auto final_lut = (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK
+                          ? mem_ptr->identity_lut
+                          : message_and_carry_extract_luts);

-    // apply keyswitch to BIG
-    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), ksed_small_to_big_expanded_lwes,
-        lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
-        casting_keys, casting_input_dimension, casting_output_dimension,
-        casting_ks_base_log, casting_ks_level, num_lwes,
-        lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
-
-    // In this case, the next keyswitch will use the compute ksk
-    ksks = compute_ksks;
-    lwe_array_input = ksed_small_to_big_expanded_lwes;
-  }
-
-  // Apply LUT
-  cuda_memset_async(lwe_array_out, 0,
-                    safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
-                                           (size_t)num_lwes, (size_t)2),
-                    streams.stream(0), streams.gpu_index(0));
-  CudaRadixCiphertextFFI output;
-  into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
-  CudaRadixCiphertextFFI input;
-  into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
-  // This is a special case only for our noise sanity checks
-  // If we are doing a SANITY_CHECK expand, we just apply the identity LUT
-  // This replicates the CPU fallback behaviour of the casting expand
-  if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
    integer_radix_apply_univariate_lookup_table<Torus>(
-        streams, &output, &input, bsks, ksks, mem_ptr->identity_lut,
-        2 * num_lwes);
-    return;
-  }
+        streams, &output, &input, bsks, ksks, final_lut, 2 * num_lwes);

-  integer_radix_apply_univariate_lookup_table<Torus>(
-      streams, &output, &input, bsks, ksks, message_and_carry_extract_luts,
-      2 * num_lwes);
-  release_cpu_radix_ciphertext_async(&input);
-  release_cpu_radix_ciphertext_async(&output);
+    release_cpu_radix_ciphertext_async(&input);
+    release_cpu_radix_ciphertext_async(&output);
+  }
  compact_lwe_lists.release();
 }

--- a/backends/tfhe-cuda-backend/get_os_name.sh
+++ b/backends/tfhe-cuda-backend/get_os_name.sh
@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-
-cat /etc/os-release | grep "\<NAME\>" | sed "s/NAME=\"//g" | sed "s/\"//g"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Andrei Stoian	64229ca391	fix(gpu): refactor crypto params in backend	2026-04-27 13:09:50 +02:00
dependabot[bot]	8bc080355d	chore(deps): bump zizmorcore/zizmor-action from 0.5.2 to 0.5.3 Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.5.2 to 0.5.3. - [Release notes](https://github.com/zizmorcore/zizmor-action/releases) - [Commits](`71321a20a9...b1d7e1fb5d`) --- updated-dependencies: - dependency-name: zizmorcore/zizmor-action dependency-version: 0.5.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-27 10:29:38 +02:00
dependabot[bot]	0cc8d625e4	chore(deps): bump actions/setup-node from 6.3.0 to 6.4.0 Bumps [actions/setup-node](https://github.com/actions/setup-node) from 6.3.0 to 6.4.0. - [Release notes](https://github.com/actions/setup-node/releases) - [Commits](`53b83947a5...48b55a011b`) --- updated-dependencies: - dependency-name: actions/setup-node dependency-version: 6.4.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-27 10:29:25 +02:00
Arthur Meyre	ec5d0da883	chore: bump ntt requirement which should have been 0.7.1 already	2026-04-27 09:49:03 +02:00
Arthur Meyre	8ed5633300	chore(hl): export two missing (Compressed)ReRandomizationKey types	2026-04-23 15:32:17 +02:00
David Testé	cf07dcf6a3	chore(docs): update leading-trailing zeros results	2026-04-23 15:16:54 +02:00
Arthur Meyre	20dad23256	chore: bump rand to 0.8.6 in data generation crate - 1.6 is done in a separate PR which will use the officially published tag as source for the code, which also updates the lock	2026-04-23 14:35:02 +02:00
Nicolas Sarlin	d7380e4264	chore(backward): use released tfhe for generate_1_6 dep	2026-04-23 14:34:41 +02:00
Nicolas Sarlin	093ffb7699	chore(ci): update toolchain to nightly 2026-04-22	2026-04-23 10:08:57 +02:00
Arthur Meyre	c804b838cb	chore: update typos file filter - with HPU data file checked out the typos CLI finds typos in essentially binary data - exclude .hpu files from the checks	2026-04-22 17:22:15 +02:00
Arthur Meyre	7b174b1865	chore: make the plaintext PRF available as a test util - KMS is testing things around the PRF and they need a way to verify the PRF application, so making a cleartext PRF function available as a test utils	2026-04-22 10:18:32 +02:00
Arthur Meyre	79cb6b6066	chore: dirty fix for zk-cuda-backend rust build	2026-04-22 10:18:21 +02:00
Nicolas Sarlin	6ff87e94bb	chore(gpu): remove os detection script (done in rust)	2026-04-22 10:04:52 +02:00
Thomas Montaigu	4c27f48968	chore(oprf): add missing into/from raw parts	2026-04-22 00:25:44 +02:00
Arthur Meyre	8bf2a12e9b	chore: dirty fix for zk-cuda-backend build problem - when compiling for real it cannot find the file which is not available	2026-04-21 17:23:30 +02:00
Arthur Meyre	64b5a0fdcd	chore: fix cuda release workflow	2026-04-21 16:30:30 +02:00
Thomas Montaigu	49c390edef	refactor(oprf): change hashed data	2026-04-21 14:43:17 +02:00
Thomas Montaigu	82860a0b01	refactor(oprf)!: use a dedicated key for oprf The OPRF is a simple bootstrap, however as it uses a custom modulus switch I decided to define a new type and not re-use the ShortintBoostrapKey, except for GPU where it was easier to reuse it. This means that shortint/integer APIs must now create an OprkPrivateKey + OprfServerKey to do oprf (or use .as_oprf_key_view) In the HLAPI no breaking change as we can use either dedicated key or fallback on the compute bsk This refactor makes the shortint oprf able to generate multiple blocks at once starting from the same seed. This is to follow some guidelines. This means that shortint's oprf now has a function doing most of the all to generate Ciphertext that encrypts random bits split evenly amongst multiple blocks	2026-04-21 14:43:17 +02:00
Theo Souchon	39ca504ce4	chore(lint): change report backward to have the right behavior for message generation	2026-04-21 14:34:13 +02:00
dependabot[bot]	61c7ffea2e	chore(deps): bump actions/upload-artifact from 7.0.0 to 7.0.1 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 7.0.0 to 7.0.1. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](`bbbca2ddaa...043fb46d1a`) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: 7.0.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-21 12:08:45 +02:00
Nicolas Sarlin	48bb3833e7	fix(shortint): proven ct list expand with a ksk but no fn fails	2026-04-20 14:15:54 +02:00
Theo Souchon	2ad2f522db	chore(lint): remove upgrade false positive warning if new variant added in an enum versioned	2026-04-20 08:24:29 +02:00
Nicolas Sarlin	2333a5591e	chore(ci): check that Cargo.lock of generate_ crates is up to date	2026-04-17 17:33:59 +02:00
David Testé	9e3e283741	doc(bench): update benchmark results tables	2026-04-17 12:52:37 +02:00
Arthur Meyre	e3b9fd56df	feat: add mul div entry points - this operation has optimization opportunities (at least for the scalar mul and scalar div case) but those won't be implemented here, this is a first commit to make the API available	2026-04-17 11:04:32 +02:00
Thomas Montaigu	05b1c9a651	feat(hlapi): bind CudaServerKey::contains	2026-04-16 16:20:11 +02:00
Thomas Montaigu	8d2caa108a	chore(hlapi): add gpu stuff to FheInteger trait	2026-04-16 16:20:11 +02:00
Thomas Montaigu	dea1b81b06	feat(hlapi): add contains for cpu	2026-04-16 16:20:11 +02:00
Arthur Meyre	a1dc91af4f	chore: update rand version in tfhe-hpu-backend - to silence a soundness warning (which does not concern us since we don't use the faulty mechanism)	2026-04-16 16:11:34 +02:00
Arthur Meyre	b34b7d39f1	chore: remove unused deps from mockup - those are not referenced at all in the code - clap-num is less clear since clap is used, so left it in for now	2026-04-16 16:11:34 +02:00
Arthur Meyre	dc14834559	chore: bump tfhe-hpu-backend after erc7984 update	2026-04-16 11:51:58 +02:00
Nicolas Sarlin	10ab4f4409	feat: add re_randomization for ProvenCompactCiphertList	2026-04-16 11:37:21 +02:00
Nicolas Sarlin	d5439a9f48	fix(core): check that ct modulus is power of two in glwe algebra	2026-04-16 11:37:21 +02:00
Mayeul@Zama	e299dc2af7	feat(integer): add improved leading_zeroes	2026-04-15 17:29:05 +02:00
Enzo Di Maria	bdb75ec806	fix(gpu): AES noise fix	2026-04-15 17:08:04 +02:00
Andrei Stoian	32cf1969bf	fix(gpu): semgrep step in pcc now fails on error	2026-04-15 14:20:34 +02:00
Andrei Stoian	600a30131e	chore(gpu): optimize CI	2026-04-15 12:48:31 +02:00
David Palm	96d230cf6f	chore: make CompressedXofKeySet::decompress take a reference	2026-04-14 16:24:33 +02:00
Nicolas Sarlin	4790f8ba1c	fix(bench): wrong size in wasm benchmarks	2026-04-14 11:17:11 +02:00
dependabot[bot]	79a54df25b	chore(deps): bump docker/login-action from 4.0.0 to 4.1.0 Bumps [docker/login-action](https://github.com/docker/login-action) from 4.0.0 to 4.1.0. - [Release notes](https://github.com/docker/login-action/releases) - [Commits](`b45d80f862...4907a6ddec`) --- updated-dependencies: - dependency-name: docker/login-action dependency-version: 4.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-14 10:32:05 +02:00
Theo Souchon	50d6be121a	chore(test): refacto around noise check test and json output	2026-04-14 09:58:20 +02:00
Nicolas Sarlin	7cd966d8a7	chore: allow rand audit advisory	2026-04-14 08:54:05 +02:00
David Testé	6ca929051d	chore(ci): remove permanent instance fallback for gpu These fallback were set to mitigate Hyperstack resource shortages. Those instances are not used anymore and workflows are modified to avoid having a workflow run stuck because it waits for a permanent runner that doesn't exist.	2026-04-10 14:30:32 +02:00
Pedro Alves	871cc8f772	chore(docs): rewrite GPU ZK-PoK documentation for zk-cuda-backend integration	2026-04-10 08:40:08 -03:00
Theo Souchon	b938473788	chore: renamed erc20 to erc7984	2026-04-10 09:18:51 +02:00
Nicolas Sarlin	74869f5e2f	chore(integer): refactor expansion helper - Split the pure expand and the post processing (cast, unpack, sanitize) - Add a new internal intermediate type: ExpandedCiphertextList - verify_and_expand just calls verify+expand	2026-04-09 11:07:03 +02:00
dependabot[bot]	326dd6a5c7	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 5.0.1 to 5.0.4. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`70c4af2ed5...ca46236c6c`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-version: 5.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-08 09:42:31 +02:00
Guillermo Oyarzun	1abc69751a	feat(gpu): create noise and pfail tests for rerand	2026-04-07 20:33:31 +02:00
Pedro Alves	3c2cb273d5	chore(docs): add GPU ZK benchmark SVG generation to CI pipeline Add GPU ZK benchmark and SVG generation jobs to the documentation workflows, and fix the data extractor to handle the cuda::zk:: prefix chain in GPU ZK benchmark names.	2026-04-07 05:02:04 -03:00
dependabot[bot]	b18060e5c8	chore(deps): bump codecov/codecov-action from 5.5.2 to 6.0.0 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.5.2 to 6.0.0. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](`671740ac38...57e3a136b7`) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-version: 6.0.0 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-07 09:51:30 +02:00
dependabot[bot]	c8827a21a7	chore(deps): bump rust-lang/crates-io-auth-action from 1.0.3 to 1.0.4 Bumps [rust-lang/crates-io-auth-action](https://github.com/rust-lang/crates-io-auth-action) from 1.0.3 to 1.0.4. - [Release notes](https://github.com/rust-lang/crates-io-auth-action/releases) - [Commits](`b7e9a28ede...bbd81622f2`) --- updated-dependencies: - dependency-name: rust-lang/crates-io-auth-action dependency-version: 1.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-04-07 09:51:08 +02:00
Andrei Stoian	a7476d0aaa	chore(gpu): update benchmarks GPU fix(gpu): pbs benches fix(gpu): pbs benches	2026-04-03 11:29:51 +02:00
David Testé	10d104e500	chore: update copyright year to 2026	2026-04-03 10:22:13 +02:00
David Testé	dbb1f151c8	chore(ci): add release workflow for zk-cuda-backend	2026-04-01 11:14:31 +02:00
David Testé	9cb8ad9bff	chore(ci): create common cuda release workflow This refactorting is done to add zk-cuda-backend crate release without duplicating the logic in the new workflow.	2026-04-01 11:14:31 +02:00
David Testé	d970210ae4	chore(ci): update slab-github-runner action to v1.6.0 This action version now uses node24 as runner since node20 support is dropped on April 2026.	2026-04-01 09:47:44 +02:00
David Palm	5236c21733	chore: Move safe-serialization to own crate and wire it up with the workspace.	2026-03-31 16:30:40 +02:00
Guillermo Oyarzun	7598725c7e	feat(gpu): add pbs128 pattern to multi-bit noise test	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	f0cff6176d	feat(gpu): add cpk ks ms pattern to multi-bit noise tests	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	8bb38d4e70	feat(gpu): add packing ks multi-bit noise tests	2026-03-31 14:30:01 +02:00
Guillermo Oyarzun	35fe71cc07	feat(gpu): add br_dp_ks_ms pattern to multi-bit noise tests	2026-03-31 14:30:01 +02:00
Nicolas Sarlin	62429da859	chore(ci): publish tfhe-compat js package	2026-03-31 13:40:08 +02:00
dependabot[bot]	8a4b3c35f4	chore(deps): bump actions/cache from 5.0.3 to 5.0.4 Bumps [actions/cache](https://github.com/actions/cache) from 5.0.3 to 5.0.4. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](`cdf6c1fa76...668228422a`) --- updated-dependencies: - dependency-name: actions/cache dependency-version: 5.0.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-03-30 14:31:27 +02:00
Theo Souchon	641fec028f	chore(lint): add message for the backward compat report when everything is ok	2026-03-30 13:25:58 +02:00
Nicolas Sarlin	8d8379409b	chore(shortint): remove 'parallel-wasm-api' feature gating This used to be required but now rayon handles this gracefully and runs the code sequentially when threads are not available	2026-03-30 11:58:52 +02:00
Nicolas Sarlin	d547e67f66	refactor(hl): factorize hl proven ct list expand code	2026-03-30 11:58:40 +02:00
Arthur Meyre	4cf03c063d	chore: update Cargo.locks for generate crates	2026-03-30 11:02:17 +02:00