fix(gpu): refactor crypto params in backend

chore(deps): bump zizmorcore/zizmor-action from 0.5.2 to 0.5.3
Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.5.2 to 0.5.3. - [Release notes](https://github.com/zizmorcore/zizmor-action/releases) - [Commits](https://github.com/zizmorcore/zizmor-action/compare/71321a20a9...b1d7e1fb5d) --- updated-dependencies: - dependency-name: zizmorcore/zizmor-action dependency-version: 0.5.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>
2026-04-28 03:01:21 -04:00 · 2026-04-27 13:09:50 +02:00 · 2026-04-27 10:29:38 +02:00 · 2026-04-27 10:29:25 +02:00 · 2026-04-27 09:49:03 +02:00 · 2026-04-23 15:32:17 +02:00
494 changed files with 33444 additions and 11800 deletions
--- a/.github/workflows/aws_data_tests.yml
+++ b/.github/workflows/aws_data_tests.yml
@@ -54,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -16,7 +16,6 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
@@ -37,6 +36,7 @@ jobs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
+      safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -79,6 +79,7 @@ jobs:
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+              - utils/tfhe-safe-serialize/**
            csprng:
              - tfhe-csprng/**
            zk_pok:
@@ -86,6 +87,8 @@ jobs:
            versionable:
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
+            safe_serialize:
+              - utils/tfhe-safe-serialize/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -122,6 +125,7 @@ jobs:
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
          steps.changed-files.outputs.versionable_any_changed == 'true' ||
+          steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -145,7 +149,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -170,6 +174,11 @@ jobs:
        run: |
          make test_versionable

+      - name: Run tfhe-safe-serialize tests
+        if: needs.should-run.outputs.safe_serialize_test == 'true'
+        run: |
+          make test_safe_serialize
+
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -191,7 +200,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -204,7 +213,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -34,7 +34,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -14,12 +14,11 @@ env:
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-
 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]

 permissions:
  contents: read
@@ -32,16 +31,16 @@ jobs:
    if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read  # Needed to check for file change
+      pull-requests: read # Needed to check for file change
    outputs:
      wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
-          steps.changed-files.outputs.wasm_any_changed }}
+        steps.changed-files.outputs.wasm_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
@@ -63,6 +62,7 @@ jobs:
                - tfhe/js_on_wasm_tests/**
                - tfhe/web_wasm_parallel_tests/**
                - utils/tfhe-versionable/**
+                - utils/tfhe-safe-serialize/**
                - .github/workflows/aws_tfhe_wasm_tests.yml

  wasm-tests:
@@ -78,7 +78,7 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
@@ -92,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -105,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -128,15 +128,21 @@ jobs:
        run: |
          make test_nodejs_wasm_api_ci

-      - name: Run parallel wasm tests
-        run: |
-          make test_web_js_api_parallel_chrome_ci
-
      - name: Run wasm_par_mq tests
        run: |
          make test_wasm_par_mq_chrome_ci
          make test_wasm_par_mq_firefox_ci

+      - name: Run parallel wasm tests
+        run: |
+          make test_web_js_api_parallel_chrome_ci
+          make test_web_js_api_parallel_firefox_ci
+
+      - name: Run cross origin wasm tests
+        run: |
+          make test_web_js_api_cross_origin_chrome_ci
+          make test_web_js_api_cross_origin_firefox_ci
+
      - name: Run x86_64/wasm zk compatibility tests
        run: |
          make test_zk_wasm_x86_compat_ci
--- a/.github/workflows/backward_compat_pr_change_report.yml
+++ b/.github/workflows/backward_compat_pr_change_report.yml
@@ -6,6 +6,9 @@ name: backward_compat_pr_change_report
 on:
  pull_request:

+env:
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+
 permissions:
  contents: read

@@ -14,9 +17,35 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  should-run:
+    name: backward_compat_pr_change_report/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            backward:
+              - utils/tfhe-lints/snapshots/*.json
+
  change-report:
    name: backward_compat_pr_change_report/change-report (bpr)
    runs-on: ubuntu-latest
+    needs: should-run
+    if:
+      needs.should-run.outputs.backward_report == 'true'
    permissions:
      pull-requests: write # To send and modify message in the PR
    steps:
@@ -50,19 +79,11 @@ jobs:
            exit 1
          fi

-      - name: Find existing comment
+      - name: Post/refresh backward-compat report
        if: steps.report.outputs.has_report == 'true'
-        id: find-comment
-        uses: peter-evans/find-comment@3eae4d37986fb5a8592848f6a574fdf654e61f9e # v3.1.0
+        uses: marocchino/sticky-pull-request-comment@0ea0beb66eb9baf113663a64ec522f60e49231c0
        with:
-          issue-number: ${{ github.event.pull_request.number }}
-          body-includes: '**Backward-compat snapshot:'
-
-      - name: Comment on PR
-        if: steps.report.outputs.has_report == 'true'
-        uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0
-        with:
-          comment-id: ${{ steps.find-comment.outputs.comment-id }}
-          issue-number: ${{ github.event.pull_request.number }}
-          body-path: report.md
-          edit-mode: replace
+          header: backward-compat-snapshot
+          hide_and_recreate: true
+          hide_classify: OUTDATED
+          path: report.md
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -14,11 +14,12 @@ on:
          - signed_integer
          - integer_compression
          - integer_zk
+          - msm_zk
          - shortint
          - shortint_oprf
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
          - hlapi_kvstore
@@ -92,8 +93,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -107,7 +107,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set martix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -223,7 +223,7 @@ jobs:
          results_type: ${{ inputs.additional_results_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -108,14 +108,14 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-hlapi-erc20:
-    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
+  run-benchmarks-hlapi-erc7984:
+    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
-      command: hlapi_erc20
-      additional_file_to_parse: erc20_pbs_count.csv
+      command: hlapi_erc7984
+      additional_file_to_parse: erc7984_pbs_count.csv
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_ct_key_sizes
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -17,6 +17,10 @@ on:
        description: "Run GPU core-crypto benchmarks"
        type: boolean
        default: true
+      run-gpu-zk-benchmarks:
+        description: "Run GPU ZK benchmarks"
+        type: boolean
+        default: true
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
@@ -36,7 +40,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -91,7 +95,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc20
+      command: integer_multi_bit,hlapi_erc7984
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -110,7 +114,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer,hlapi_erc7984
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -165,21 +169,42 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

+  run-benchmarks-gpu-zk-server:
+    name: benchmark_documentation/run-benchmarks-gpu-zk-server
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-zk-benchmarks
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100-SXM5x8
+      command: integer_zk
+      op_flavor: default
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
  generate-svgs-with-benchmarks-run:
    name: benchmark-documentation/generate-svgs-with-benchmarks-run
    if: ${{ always() &&
-      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
+      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
-      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
+      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
+      run-benchmarks-gpu-zk-server
    ]
    uses: ./.github/workflows/generate_svgs.yml
    with:
      time_span_days: 5
      generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
-      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
+      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
      generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -188,7 +213,7 @@ jobs:

  generate-svgs-without-benchmarks-run:
    name: benchmark-documentation/generate-svgs-without-benchmarks-run
-    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
+    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    uses: ./.github/workflows/generate_svgs.yml
    with:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,10 +31,13 @@ on:
          - pbs128
          - ks
          - ks_pbs
+          - tfhe_zk_pok
+          - msm_zk
          - integer_zk
+          - integer_zk_experimental
          - integer_aes
          - integer_aes256
-          - hlapi_erc20
+          - hlapi_erc7984
          - hlapi_dex
          - hlapi_noise_squash
      op_flavor:
@@ -120,8 +123,8 @@ jobs:

          if inputs_command == "integer_zk":
            files_to_parse.append("pke_zk_crs_sizes.csv")
-          elif inputs_command == "hlapi_erc20":
-            files_to_parse.append("erc20_pbs_count.csv")
+          elif inputs_command == "hlapi_erc7984":
+            files_to_parse.append("erc7984_pbs_count.csv")
          elif inputs_command == "hlapi_dex":
            files_to_parse.extend(
              [
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -89,7 +89,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -173,7 +173,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -111,7 +111,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set martix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -126,17 +126,11 @@ jobs:
    needs: prepare-matrix
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,25 +139,6 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

-      - name: Acknowledge remote instance failure
-        if: steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile != 'single-h100'
-        run: |
-          echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
-          echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
-          exit 1
-        env:
-          INPUTS_PROFILE: ${{ inputs.profile }}
-
-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' &&
-          steps.start-remote-instance.outcome == 'failure' &&
-          inputs.profile == 'single-h100'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  # Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
  install-dependencies:
    name: benchmark_gpu_common/install-dependencies
@@ -184,7 +159,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -296,7 +270,7 @@ jobs:
          filenames: ${{ inputs.additional_file_to_parse }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -333,13 +307,13 @@ jobs:

  teardown-instance:
    name: benchmark_gpu_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -42,7 +42,7 @@ env:
  OPTIMIZATION_TARGET: "throughput"
  BATCH_SIZE: "5000"
  SCHEDULING_POLICY: "MAX_PARALLELISM"
-  BENCHMARKS: "erc20"
+  BENCHMARKS: "erc7984"
  BRANCH_NAME: ${{ github.ref_name }}
  COMMIT_SHA: ${{ github.sha }}
  SLAB_SECRET: ${{ secrets.JOB_SECRET }}
@@ -77,7 +77,7 @@ jobs:
          if [[ ${IS_MANUAL_RUN} == true ]]; then
            PROFILE_RAW="${PROFILE_MANUAL_RUN}"
          else
-            PROFILE_RAW="${PROFILE}"
+            PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
          fi
          # shellcheck disable=SC2001
          PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
@@ -94,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: |
            ~/.cargo/registry
@@ -214,14 +214,14 @@ jobs:
          restore-keys: ${{ runner.os }}-cargo-

      - name: Login to GitHub Container Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
@@ -232,7 +232,7 @@ jobs:
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Use Node.js
-        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: 20.x

@@ -248,13 +248,13 @@ jobs:
          npm install && npm run deploy:emptyProxies && npx hardhat compile
        working-directory: fhevm/

-      - name: Profile erc20 no-cmux benchmark on GPU
+      - name: Profile erc7984 no-cmux benchmark on GPU
        run: |
          BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
          FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
          BENCHMARK_TYPE="THROUGHPUT_200" \
          OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
-          make -e "profile_erc20_gpu"
+          make -e "profile_erc7984_gpu"
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Get nsys profile name
@@ -271,7 +271,7 @@ jobs:
      - name: Upload profile artifact
        env:
          REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ env.REPORT_NAME }}
          path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
@@ -302,7 +302,7 @@ jobs:
        working-directory: fhevm/

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${COMMIT_SHA}_${BENCHMARKS}_${{ needs.parse-inputs.outputs.profile }}
          path: fhevm/$${{ env.RESULTS_FILENAME }}
@@ -333,7 +333,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -14,7 +14,7 @@ on:
          - integer
          - hlapi_unsigned
          - hlapi_signed
-          - hlapi_erc20
+          - hlapi_erc7984
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -95,7 +95,7 @@ jobs:
            ]:
              f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")

-      - name: Set martix arguments outputs
+      - name: Set matrix arguments outputs
        id: set_matrix_args
        run: | # zizmor: ignore[template-injection] these env variable are safe
          {
@@ -121,7 +121,7 @@ jobs:
    steps:
      # Needed as long as hw_regmap repository is private
      - name: Configure SSH
-        uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
+        uses: webfactory/ssh-agent@e83874834305fe9a4a2997156cb26c5de65a8555 # v0.10.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

@@ -185,7 +185,7 @@ jobs:
          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -280,7 +280,7 @@ jobs:
          BENCH_TYPE: ${{ env.__TFHE_RS_BENCH_TYPE }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_regression_${{ env.RESULTS_FILE_SHA }} # RESULT_FILE_SHA is needed to avoid collision between matrix.command runs
          path: ${{ env.RESULTS_FILENAME }}
@@ -387,7 +387,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -40,7 +40,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_ntt
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -63,7 +63,7 @@ jobs:
          with open(env_file, "a") as f:
            f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")

-      - name: Set martix arguments output
+      - name: Set matrix arguments output
        id: set_matrix_arg
        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
@@ -77,7 +77,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -137,7 +137,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -158,9 +158,9 @@ jobs:
        env:
          BROWSER: ${{ matrix.browser }}

-      - name: Run benchmarks (unsafe coop)
+      - name: Run benchmarks (cross origin)
        run: |
-          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
+          make bench_web_js_api_cross_origin_"${BROWSER}"_ci
        env:
          BROWSER: ${{ matrix.browser }}

@@ -180,7 +180,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -218,7 +218,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -94,7 +94,7 @@ jobs:
          with open(env_file, "a") as f:
            f.write(f"""RUNNERS=["{'", "'.join(runners)}"]\n""")

-      - name: Set martix runners outputs
+      - name: Set matrix runners outputs
        id: set_matrix_runners
        run: | # zizmor: ignore[template-injection] these env variable are safe
          echo "runners=${{ toJSON(env.RUNNERS) }}" >> "${GITHUB_OUTPUT}"
@@ -138,7 +138,7 @@ jobs:
      - name: Node cache restoration
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -151,7 +151,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -43,14 +43,14 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@0dce2577a4760a2749d8cfb7a84b7d5585ebcb7d # v0.5.0
+        uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -74,7 +74,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -88,7 +88,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -87,7 +87,7 @@ jobs:

      - name: Upload tables
        if: inputs.backend_comparison == false
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
          # This will upload all the file generated
@@ -111,7 +111,7 @@ jobs:

      - name: Upload comparison tables
        if: inputs.backend_comparison == true
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a
        with:
          name: ${{ github.sha }}_backends_comparison_tables
          # This will upload all the file generated
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -209,60 +209,98 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

+  gpu-zk-server-latency-table:
+    name: generate_documentation_svgs/gpu-zk-server-latency-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: latency
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-latency
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  gpu-zk-server-throughput-table:
+    name: generate_documentation_svgs/gpu-zk-server-throughput-table
+    uses: ./.github/workflows/generate_svg_common.yml
+    if: inputs.generate-gpu-svgs
+    with:
+      backend: gpu
+      hardware_name: n3-H100-SXM5x8
+      layer: integer
+      bench_subset: zk
+      pbs_kind: multi_bit
+      grouping_factor: 4
+      bench_type: throughput
+      time_span_days: ${{ inputs.time_span_days }}
+      output_filename: gpu-zk-benchmark-throughput
+    secrets:
+      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
+      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
+      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
  # -----------------------------------------------------------
-  # ERC20 benchmarks tables
+  # ERC7984 benchmarks tables
  # -----------------------------------------------------------

-  cpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
+  cpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
      backend: cpu
      hardware_name: hpc7a.96xlarge
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
+      output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  gpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
+  gpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-gpu-svgs
    with:
      backend: gpu
      hardware_name: n3-H100-SXM5x8
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: multi_bit
      grouping_factor: 4
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
+      output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  hpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
+  hpu-erc7984-latency-throughput-table:
+    name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-hpu-svgs
    with:
      backend: hpu
      hardware_name: hpu_x1
      layer: hlapi
-      bench_subset: erc20
+      bench_subset: erc7984
      pbs_kind: classical
      bench_type: both
      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
+      output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -62,29 +63,24 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,13 +89,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -132,7 +121,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_core_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -77,7 +77,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,17 +25,11 @@ jobs:
    name: gpu_full_h100_tests/setup-instance
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,13 +38,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: gpu_full_h100_tests/cuda-tests-linux
    needs: [ setup-instance ]
@@ -74,7 +61,6 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -118,13 +104,13 @@ jobs:

  teardown-instance:
    name: gpu_full_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +186,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -65,27 +66,23 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -133,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -184,14 +173,14 @@ jobs:

  teardown-instance:
    name: gpu_hlapi_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -17,8 +17,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Nightly tests will be triggered each evening 8p.m.
-    - cron: "0 20 * * *"
+    # Weekly tests will be triggered every Monday at 8p.m.
+    - cron: "0 20 * * 1"
  pull_request:


@@ -28,17 +28,48 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  should-run:
+    name: gpu_integer_long_run_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - '.github/workflows/gpu_integer_long_run_tests.yml'
+
  setup-instance:
    name: gpu_integer_long_run_tests/setup-instance
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    needs: [should-run]
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +143,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -74,7 +74,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -131,6 +131,10 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

+      - name: Run semgrep and lint checks on CUDA code
+        run: |
+          make semgrep_and_lint_gpu_code
+
      - name: Run fmt checks
        run: |
          make check_fmt_gpu
@@ -139,10 +143,6 @@ jobs:
        run: |
          make pcc_gpu

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          make semgrep_and_lint_gpu_code
-
      - name: Run semver checks on tfhe-cuda-backend
        run: |
          make semver_check_cuda_backend
@@ -176,7 +176,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,30 +64,25 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_signed_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +177,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh

@@ -80,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -169,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,30 +64,25 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
-      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
-      # If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
-      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
-      # otherwise we'll try to run the next job on a non-existing on-demand instance.
-      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        continue-on-error: true
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -95,13 +91,6 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This will allow to fallback on permanent instances running on Hyperstack.
-      - name: Use permanent remote instance
-        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
-        run: |
-          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
-
      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
@@ -134,7 +123,6 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
@@ -176,14 +164,14 @@ jobs:

  teardown-instance:
    name: gpu_unsigned_integer_h100_tests/teardown-instance
-    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh

@@ -81,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +177,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -51,7 +51,13 @@ jobs:
        with:
          files_yaml: |
            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/zk/**
+              - tfhe-zk-pok/**
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

@@ -67,7 +73,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -126,6 +132,9 @@ jobs:
      - name: Run zk-cuda-backend integration tests
        run: |
          make test_zk_cuda_backend
+          make test_zk_pok_experimental_gpu
+          make test_integer_zk_gpu
+          make test_integer_zk_experimental_gpu

  slack-notify:
    name: gpu_zk_tests/slack-notify
@@ -158,7 +167,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -62,7 +62,7 @@ jobs:
          PACKAGE: ${{ inputs.package-name }}
        run: |
          cargo package -p "${PACKAGE}"
-      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate
@@ -101,13 +101,13 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
          name: crate-${{ inputs.package-name }}
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
--- a/.github/workflows/make_release_common_cuda.yml
+++ b/.github/workflows/make_release_common_cuda.yml
@@ -1,12 +1,36 @@
-name: make_release_cuda
+# Common workflow to make crate release for CUDA backend
+name: make_release_common_cuda

 on:
-  workflow_dispatch:
+  workflow_call:
    inputs:
-      dry_run:
-        description: "Dry-run"
+      package-name:
+        type: string
+        required: true
+      dry-run:
        type: boolean
        default: true
+    secrets:
+      REPO_CHECKOUT_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true
+      ALLOWED_TEAM:
+        required: true
+      READ_ORG_TOKEN:
+        required: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -21,15 +45,15 @@ permissions: {}

 jobs:
  verify-triggering-actor:
-    name: make_release_cuda/verify-triggering-actor
+    name: make_release_common_cuda/verify-triggering-actor
    if: startsWith(github.ref, 'refs/tags/')
    uses: ./.github/workflows/verify_triggering_actor.yml
    secrets:
-      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  setup-instance:
-    name: make_release_cuda/setup-instance
+    name: make_release_common_cuda/setup-instance
    needs: verify-triggering-actor
    runs-on: ubuntu-latest
    outputs:
@@ -37,7 +61,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,7 +71,7 @@ jobs:
          profile: gpu-build

  package:
-    name: make_release_cuda/package
+    name: make_release_common_cuda/package
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    outputs:
@@ -76,7 +100,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -89,7 +112,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -101,12 +123,14 @@ jobs:
          GCC_VERSION: ${{ matrix.gcc }}

      - name: Prepare package
+        env:
+          PACKAGE: ${{ inputs.package-name }}
        run: |
-          cargo package -p tfhe-cuda-backend
+          cargo package -p "${PACKAGE}"

-      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: crate-tfhe-cuda-backend
+          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate

      - name: generate hash
@@ -114,8 +138,8 @@ jobs:
        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

  provenance:
-    name: make_release_cuda/provenance
-    if: ${{ !inputs.dry_run  }}
+    name: make_release_common_cuda/provenance
+    if: ${{ !inputs.dry-run  }}
    needs: [package]
    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
@@ -128,7 +152,7 @@ jobs:
      base64-subjects: ${{ needs.package.outputs.hash }}

  publish-cuda-release:
-    name: make_release_cuda/publish-cuda-release
+    name: make_release_common_cuda/publish-cuda-release
    needs: [setup-instance, package] # for comparing hashes
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    permissions:
@@ -150,7 +174,6 @@ jobs:
          toolchain: stable

      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
        run: |
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          {
@@ -163,7 +186,6 @@ jobs:

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
        run: |
          {
            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -174,25 +196,33 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

-      - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
-          name: crate-tfhe-cuda-backend
+          fetch-depth: 0
+          persist-credentials: "false"
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Download artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: crate-${{ inputs.package-name }}
          path: target/package

      - name: Authenticate on registry
-        uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
+        uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
        id: auth

      - name: Publish crate.io package
        env:
          CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+          PACKAGE: ${{ inputs.package-name }}
+          DRY_RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
+          # DRY_RUN expansion cannot be double-quoted: when the variable is an empty string, cargo publish
          # would fail. This is safe since DRY_RUN is handled in the env section above.
          # shellcheck disable=SC2086
-          cargo publish -p tfhe-cuda-backend ${DRY_RUN}
+          cargo publish -p "${PACKAGE}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -204,7 +234,7 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -212,17 +242,17 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: make_release_cuda/teardown-instance
+    name: make_release_common_cuda/teardown-instance
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, publish-cuda-release]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -232,7 +262,7 @@ jobs:

      - name: Slack Notification
        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -16,6 +16,10 @@ on:
        description: "Push web js package"
        type: boolean
        default: true
+      push_web_compat_package:
+        description: "Push web compat (cross-origin) js package"
+        type: boolean
+        default: true
      push_node_package:
        description: "Push node js package"
        type: boolean
@@ -85,7 +89,7 @@ jobs:
          make build_web_js_api_parallel

      - name: Authenticate on NPM
-        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'
@@ -99,6 +103,23 @@ jobs:
          tag: ${{ env.NPM_TAG }}
          provenance: true

+      - name: Build web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        run: |
+          rm -rf tfhe/pkg
+
+          make build_web_js_api
+          sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
+
+      - name: Publish web compat (cross-origin) package
+        if: ${{ inputs.push_web_compat_package }}
+        uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
+        with:
+          package: tfhe/pkg/package.json
+          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}
+          provenance: true
+
      - name: Build Node package
        if: ${{ inputs.push_node_package }}
        run: |
--- a/.github/workflows/make_release_tfhe_cuda.yml
+++ b/.github/workflows/make_release_tfhe_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of tfhe-rs CUDA backend on crates.io.
+name: make_release_tfhe_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "tfhe-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/make_release_tfhe_safe_serialize.yml
+++ b/.github/workflows/make_release_tfhe_safe_serialize.yml
@@ -0,0 +1,32 @@
+name: make_release_tfhe_safe_serialize
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_tfhe_safe_serialize/make-release
+    uses: ./.github/workflows/make_release_common.yml
+    with:
+      package-name: "tfhe-safe-serialize"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
--- a/.github/workflows/make_release_wasm_par_mq.yml
+++ b/.github/workflows/make_release_wasm_par_mq.yml
--- a/.github/workflows/make_release_zk_cuda.yml
+++ b/.github/workflows/make_release_zk_cuda.yml
@@ -0,0 +1,44 @@
+# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
+name: make_release_zk_cuda
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  make-release:
+    name: make_release_zk_cuda/make-release
+    uses: ./.github/workflows/make_release_common_cuda.yml
+    with:
+      package-name: "zk-cuda-backend"
+      dry-run: ${{ inputs.dry_run }}
+    permissions:
+      actions: read # Needed to detect the GitHub Actions environment
+      id-token: write # Needed to create the provenance via GitHub OIDC
+      contents: write # Needed to upload assets/artifacts
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Restore Sagemath image from cache
        id: docker-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
      - name: Store Sagemath image in cache
        if: steps.docker-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -24,6 +24,8 @@ jobs:
          SOURCE_REPO: "zama-ai/tfhe-rs"
          SOURCE_BRANCH: "main"
          DESTINATION_BRANCH: "main"
+          SOURCE_TAGS: "refs/tags/*"
+          DESTINATION_TAGS: "refs/tags/*"
          USERNAME: ${{ secrets.BOT_USERNAME }}
          TOKEN: ${{ secrets.SYNC_REPO_TOKEN }}
          DEST_REPO: ${{ secrets.SYNC_DEST_REPO }}
@@ -33,49 +35,16 @@ jobs:
          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

-          # The LFS config disables pulling files by default, so remove it
-          # TODO: see if we need to more precisely fetch LFS files or if git is smart
-          rm .lfsconfig
-
          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
-          git fetch source '+refs/heads/*:refs/heads/*' --update-head-ok
+          git fetch --all --tags --update-head-ok --quiet

-          echo ">>> Print out all branches"
-          git --no-pager branch -a -vv
+          echo ">>> Sync LFS items from source..."
+          ./scripts/lfs_sync.sh source destination "${SOURCE_BRANCH}"

-          echo ">>> Pull LFS items from source..."
-          git lfs pull source "${SOURCE_BRANCH}"
-
-          echo ">>> Pushing git changes and LFS content..."
+          echo ">>> Pushing git changes for ${SOURCE_BRANCH}..."
          git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f

-          shred --remove .git/config
-
-      - name: git-sync-tags
-        env:
-          SOURCE_REPO: "zama-ai/tfhe-rs"
-          SOURCE_BRANCH: "refs/tags/*"
-          DESTINATION_BRANCH: "refs/tags/*"
-          USERNAME: ${{ secrets.BOT_USERNAME }}
-          TOKEN: ${{ secrets.SYNC_REPO_TOKEN }}
-          DEST_REPO: ${{ secrets.SYNC_DEST_REPO }}
-        run: |
-          echo ">>> Cloning source repo..."
-          git lfs install
-          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
-          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
-
-          # The LFS config disables pulling files by default, so remove it
-          # TODO: see if we need to more precisely fetch LFS files for new tags or if git is smart
-          rm .lfsconfig
-
-          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
-          git fetch source '+refs/heads/*:refs/heads/*' --update-head-ok
-
-          echo ">>> Print out all branches"
-          git --no-pager branch -a -vv
-
-          echo ">>> Pushing git changes and LFS content..."
-          git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
+          echo ">>> Pushing git tags..."
+          git push destination "${SOURCE_TAGS}:${DESTINATION_TAGS}" -f

          shred --remove .git/config
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
+backends/tfhe-cuda-backend/cuda/build/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
@@ -34,6 +35,9 @@ node_modules/
 package-lock.json
 utils/wasm-par-mq/examples/*/pkg/

+# Commit lock files of backward data generation crates
+!utils/tfhe-backward-compat-data/crates/generate_*/Cargo.lock
+
 # Python .env
 .env
 __pycache__
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,10 +14,12 @@ members = [
    "tfhe-fft",
    "tfhe-ntt",
    "tfhe-zk-pok",
+    "utils/benchmark_spec",
    "utils/param_dedup",
    "utils/tfhe-backward-compat-checker",
    "utils/tfhe-backward-compat-data",
    "utils/tfhe-backward-compat-data/crates/add_new_version",
+    "utils/tfhe-safe-serialize",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/wasm-par-mq",
@@ -43,6 +45,7 @@ rand = "0.8"
 rayon = "1.11"
 serde = { version = "1.0", default-features = false }
 wasm-bindgen = { version = "0.2.114" }
+wasm-bindgen-futures = { version = "0.4.56" }
 # js-sys (at this point in time) automatically enables the unsafe-eval feature which we do not want
 # this does not prevent other deps from enabling it, but it at least conveys our need to not have it
 # we still enable std, which was part of default before
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/261
+++ b/261
@@ -122,6 +122,12 @@ install_build_wasm32_target:
 	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
 	Rustup can be downloaded at https://rustup.rs/" && exit 1 )

+.PHONY: install_check_wasm32_target # Install the wasm32 toolchain used for checks
+install_check_wasm32_target:
+	rustup target add wasm32-unknown-unknown --toolchain "$(RS_CHECK_TOOLCHAIN)" || \
+	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
+	Rustup can be downloaded at https://rustup.rs/" && exit 1 )
+
 .PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
 install_cargo_nextest:
 	@cargo nextest --version > /dev/null 2>&1 || \
@@ -306,7 +312,7 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
 	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
 		| grep -v '/cmake-build-debug/' \
 		| grep -v '/build/' \
-		| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
+		| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
 	venv/bin/python3 "scripts/check_scratch_cleanup.py"

 .PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
@@ -350,23 +356,23 @@ check_fmt_js: check_nvm_installed
 .PHONY: check_fmt_toml # Check TOML files format
 check_fmt_toml: install_taplo
 	@RUST_LOG=warn taplo fmt --check || \
-	echo "TOML files format check failed. Please run 'make fmt_toml'"
+	{ echo "TOML files format check failed. Please run 'make fmt_toml'"; exit 1; }

 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
-	@typos && echo "No typos found"
+	@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" ":!*.hpu" | typos --file-list - && echo "No typos found"

 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
 		--all-targets \
 		-p tfhe

@@ -380,7 +386,7 @@ clippy_hpu: install_rs_check_toolchain
 .PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
 clippy_gpu_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

@@ -473,7 +479,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 	fi && \
 	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
 		-p tfhe -- --nocapture

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -484,11 +490,17 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
+		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
 		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,parallel-wasm-api \
+		-p tfhe -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
@@ -529,6 +541,15 @@ clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings

+.PHONY: clippy_zk_pok_wasm # Run clippy lints on tfhe-zk-pok for wasm32 target
+clippy_zk_pok_wasm: install_rs_check_toolchain install_check_wasm32_target
+	RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--target wasm32-unknown-unknown \
+		-p tfhe-zk-pok -- --no-deps -D warnings
+	RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--target wasm32-unknown-unknown \
+		-p tfhe-zk-pok --features cross-origin-wasm -- --no-deps -D warnings
+
 .PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
 clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -536,6 +557,11 @@ clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-versionable -- --no-deps -D warnings

+.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
+clippy_safe_serialize: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-safe-serialize -- --no-deps -D warnings
+
 .PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
 clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -561,15 +587,28 @@ clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selec
 		echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
 	fi

+.PHONY: check_backward_compat_locks_did_not_change # Check backward compat Cargo.lock files are up to date
+check_backward_compat_locks_did_not_change: install_rs_check_toolchain
+	@for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
+		echo "checking Cargo.lock for $$crate"; \
+		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
+			-C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate metadata --locked --format-version 1 > /dev/null || \
+		( echo "Cargo.lock for $$crate is out of date. Update it with:" && \
+		  echo "  cd $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate && cargo metadata --format-version 1 > /dev/null" && \
+		  echo "then commit the updated Cargo.lock." && exit 1 ); \
+	done
+
 .PHONY: clippy_test_vectors # Run clippy lints on the test vectors app
 clippy_test_vectors: install_rs_check_toolchain
 	cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-test-vectors -- --no-deps -D warnings

+# WARNING: This target is not directly run in CI. When adding a subtarget here,
+# MAKE SURE TO ALSO ADD IT TO A PCC BATCH BELOW
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
+clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
 clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq

 .PHONY: clippy_fast # Run main clippy targets
@@ -666,7 +705,7 @@ build_c_api: install_rs_check_toolchain
 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
 		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -675,11 +714,14 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
 		-p tfhe

-.PHONY: build_web_js_api # Build the js API targeting the web browser
+.PHONY: build_web_js_api # Build the js API targeting the web browser, in sequential or cross-origin parallelism modes.
 build_web_js_api: install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api && \
+	find pkg/snippets -type f -iname worker_helpers.js -exec sed -i 's|import("../../..")|import("../../../tfhe.js")|g' {} \;
+	cp utils/wasm-par-mq/js/coordinator.js tfhe/pkg/
+	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 # parallel wasm requires specific build options, see https://github.com/rust-lang/rust/pull/147225
@@ -765,7 +807,7 @@ test_zk_cuda_backend:


 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu:
@@ -1201,12 +1243,31 @@ test_tfhe_csprng_big_endian: install_cargo_cross
 	RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu

-
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok --features experimental

+.PHONY: test_zk_pok_experimental_gpu # Run tfhe-zk-pok GPU-accelerated tests
+test_zk_pok_experimental_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
+
+.PHONY: test_integer_zk_gpu # Run the tfhe integer::gpu::zk tests (zk-pok on the gpu backend)
+test_integer_zk_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--features=integer,zk-pok,gpu -p tfhe -- \
+		integer::gpu::zk::
+
+.PHONY: test_integer_zk_experimental_gpu # Run the tfhe integer::gpu::zk tests with the gpu-experimental-zk feature enabled
+test_integer_zk_experimental_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
+		integer::gpu::zk::
+
+.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
+test_zk_cuda: test_zk_cuda_backend test_zk_pok_experimental_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
+
 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
@@ -1225,6 +1286,11 @@ test_versionable:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		--all-targets -p tfhe-versionable

+.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
+test_safe_serialize:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--all-targets -p tfhe-safe-serialize
+
 # The backward compat data folder holds historical binary data but also rust code to generate and load them.
 .PHONY: gen_backward_compat_data # Re-generate backward compatibility data
 gen_backward_compat_data:
@@ -1359,6 +1425,19 @@ test_nodejs_wasm_api_ci: build_node_js_api

 # This is an internal target, not meant to be called on its own.
 run_web_js_api_parallel: build_web_js_api_parallel setup_venv
+	cd $(WEB_SERVER_DIR) && npm install && npm run build
+	source venv/bin/activate && \
+	python ci/webdriver.py \
+	--browser-path $(browser_path) \
+	--driver-path $(driver_path) \
+	--browser-kind  $(browser_kind) \
+	--server-cmd $(server_cmd) \
+	--server-workdir "$(WEB_SERVER_DIR)" \
+	--id-pattern $(filter) \
+	--id-exclude-pattern asyncMainThread
+
+# This is an internal target, not meant to be called on its own.
+run_web_js_api_cross_origin: build_web_js_api setup_venv
 	cd $(WEB_SERVER_DIR) && npm install && npm run build
 	source venv/bin/activate && \
 	python ci/webdriver.py \
@@ -1401,6 +1480,38 @@ test_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) test_web_js_api_parallel_firefox

+test_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+test_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+test_web_js_api_cross_origin_chrome: browser_kind = chrome
+test_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
+test_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
+
+.PHONY: test_web_js_api_cross_origin_chrome # Run tests for the web wasm api in cross-origin mode on Chrome
+test_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
+
+.PHONY: test_web_js_api_cross_origin_chrome_ci # Run tests for the web wasm api in cross-origin mode on Chrome
+test_web_js_api_cross_origin_chrome_ci: setup_venv
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) test_web_js_api_cross_origin_chrome
+
+test_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
+test_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
+test_web_js_api_cross_origin_firefox: browser_kind = firefox
+test_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
+test_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeTest  # Only run zk proof tests in cross-origin mode
+
+.PHONY: test_web_js_api_cross_origin_firefox # Run tests for the web wasm api in cross-origin mode on Firefox
+test_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
+
+.PHONY: test_web_js_api_cross_origin_firefox_ci # Run tests for the web wasm api in cross-origin mode on Firefox
+test_web_js_api_cross_origin_firefox_ci: setup_venv
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) test_web_js_api_cross_origin_firefox
+
 WASM_PAR_MQ_TEST_DIR=utils/wasm-par-mq/web_tests

 .PHONY: build_wasm_par_mq_tests # Build the wasm-par-mq test WASM package
@@ -1557,27 +1668,57 @@ bench_integer_rerand: install_rs_check_toolchain
 	--bench integer-rerand \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

+.PHONY: bench_integer_rerand_gpu # Run benchmarks for integer rerand on GPU backend
+bench_integer_rerand_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-rerand \
+	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
+
+.PHONY: bench_msm_zk # Run the zk-msm benchmarks
+bench_msm_zk: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-msm \
+	--features=zk-pok -p tfhe-benchmark --profile release --
+
+# GPU benchmarks need --profile release for correct measurements
+.PHONY: bench_msm_zk_gpu # Run the zk-msm benchmarks matching zk::cuda::msm with the gpu and gpu-experimental-zk features
+bench_msm_zk_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-msm \
+	--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release -- zk::cuda::msm
+
+# GPU benchmarks need --profile release for correct measurements
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
+
+# GPU benchmarks need --profile release for correct measurements
+.PHONY: bench_integer_zk_experimental_gpu # Run the integer-zk-pke benchmarks with the gpu and gpu-experimental-zk features
+bench_integer_zk_experimental_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --

 .PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
 bench_integer_aes_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
 bench_integer_aes256_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes256 \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
 bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1741,37 +1882,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_parallel_firefox

-bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
-bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
-bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
+bench_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+bench_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+bench_web_js_api_cross_origin_chrome: browser_kind = chrome
+bench_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
+bench_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers

-.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
+.PHONY: bench_web_js_api_cross_origin_chrome # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin

-.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
+.PHONY: bench_web_js_api_cross_origin_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_chrome_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_unsafe_coop_chrome
+	$(MAKE) bench_web_js_api_cross_origin_chrome

-bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
-bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
-bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
+bench_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
+bench_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
+bench_web_js_api_cross_origin_firefox: browser_kind = firefox
+bench_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
+bench_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers

-.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
+.PHONY: bench_web_js_api_cross_origin_firefox # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin

-.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
+.PHONY: bench_web_js_api_cross_origin_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
+bench_web_js_api_cross_origin_firefox_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_unsafe_coop_firefox
+	$(MAKE) bench_web_js_api_cross_origin_firefox

 .PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
 bench_hlapi_unsigned: install_rs_check_toolchain
@@ -1804,25 +1945,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
 	--bench hlapi \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
-bench_hlapi_erc20: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
+bench_hlapi_erc7984: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
-bench_hlapi_erc20_gpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
+bench_hlapi_erc7984_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
-bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
+bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
@@ -1846,13 +1987,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
-bench_hlapi_erc20_hpu: install_rs_check_toolchain
+.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ERC7984 operations on HPU
+bench_hlapi_erc7984_hpu: install_rs_check_toolchain
 	source ./setup_hpu.sh --config $(HPU_CONFIG); \
 	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
@@ -1860,6 +2001,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

+.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
+bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--package tfhe-zk-pok \
+	--features=gpu-experimental --profile release
+
 .PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
 bench_hlapi_noise_squash: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
@@ -1901,10 +2049,10 @@ bench_summary: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'

 	# DEX
@@ -1946,10 +2094,10 @@ bench_summary_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'

-	# ERC20
+	# ERC7984
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
+	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'

 	# DEX
@@ -2128,6 +2276,7 @@ pcc_batch_5:
 	$(call run_recipe_with_details,clippy_tfhe_lints)
 	$(call run_recipe_with_details,check_compile_tests)
 	$(call run_recipe_with_details,clippy_backward_compat_data)
+	$(call run_recipe_with_details,check_backward_compat_locks_did_not_change)

 .PHONY: pcc_batch_6  # duration: 6'32''
 pcc_batch_6:
@@ -2136,8 +2285,10 @@ pcc_batch_6:
 	$(call run_recipe_with_details,clippy_tasks)
 	$(call run_recipe_with_details,clippy_tfhe_csprng)
 	$(call run_recipe_with_details,clippy_zk_pok)
+	$(call run_recipe_with_details,clippy_zk_pok_wasm)
 	$(call run_recipe_with_details,clippy_trivium)
 	$(call run_recipe_with_details,clippy_versionable)
+	$(call run_recipe_with_details,clippy_safe_serialize)
 	$(call run_recipe_with_details,clippy_param_dedup)
 	$(call run_recipe_with_details,docs)

--- a/_typos.toml
+++ b/_typos.toml
@@ -15,12 +15,3 @@ extend-ignore-identifiers-re = [
    "0x[0-9a-fA-F]+",
    "xrt_coreutil",
 ]
-
-[files]
-extend-exclude = [
-    "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
-    "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
-    "backends/tfhe-hpu-backend/config_store/**/*.link_summary",
-    "*.cbor",
-    "*.bcode",
-]
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.14.0"
+version = "0.15.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2026 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -1,5 +1,14 @@
 use std::path::PathBuf;
-use std::process::Command;
+
+fn get_linux_distribution_name() -> Option<String> {
+    let content = std::fs::read_to_string("/etc/os-release").ok()?;
+    for line in content.lines() {
+        if let Some(value) = line.strip_prefix("NAME=") {
+            return Some(value.trim_matches('"').to_string());
+        }
+    }
+    None
+}

 fn main() {
    if let Ok(val) = std::env::var("DOCS_RS") {
@@ -28,9 +37,7 @@ fn main() {
    println!("cargo::rerun-if-changed=src");

    if std::env::consts::OS == "linux" {
-        let output = Command::new("./get_os_name.sh").output().unwrap();
-        let distribution = String::from_utf8(output.stdout).unwrap();
-        if distribution != "Ubuntu\n" {
+        if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
            println!(
                "cargo:warning=This Linux distribution is not officially supported. \
                Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
--- a/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
+++ b/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
@@ -62,3 +62,29 @@ rules:
            cuda_synchronize_stream(...);
            ...
          }
+
+  - id: tfhe-cuda-unwrapped-cuda-runtime-call
+    message: "CUDA runtime API call is not wrapped in `check_cuda_error(...)`."
+    severity: WARNING
+    languages: [c, cpp]
+    options:
+      generic_ellipsis_max_span: 500
+    paths:
+      include:
+        - "*.cu"
+        - "*.cuh"
+        - "*.cpp"
+        - "*.h"
+      exclude:
+        - backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
+        - backends/tfhe-cuda-backend/cuda/include/device.h # contains the check_cuda_error macro (and others)
+    patterns:
+      - pattern: $FUNC(...)
+      - metavariable-regex:
+          metavariable: $FUNC
+          regex: "^cuda[A-Z][A-Za-z0-9]*$" # matches cudaMalloc/cudaMemcpy/... (not project helpers like cuda_set_device)
+      - pattern-not-inside: check_cuda_error(...)
+      - pattern-not-inside: |
+          $FUNC(...);
+          check_cuda_error(cudaGetLastError());
+      - pattern-not-inside: $FUNC(...) == $VAL
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
@@ -4,22 +4,18 @@

 extern "C" {
 uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism);

 uint64_t scratch_cuda_integer_aes_ctr_256_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism);

 void cuda_integer_aes_ctr_encrypt_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
@@ -34,12 +30,10 @@ void cleanup_cuda_integer_aes_ctr_256_encrypt_64(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_key_expansion_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_key_expansion_64_async(CudaStreamsFFI streams,
                                         CudaRadixCiphertextFFI *expanded_keys,
@@ -57,12 +51,10 @@ void cuda_integer_aes_ctr_256_encrypt_64_async(
    int8_t *mem_ptr, void *const *bsks, void *const *ksks);

 uint64_t scratch_cuda_integer_key_expansion_256_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_key_expansion_256_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *expanded_keys,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -17,10 +17,9 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_decompress,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t num_blocks_to_decompress,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_compress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -382,14 +382,17 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                       ->use_sequential_algorithm_to_resolve_group_carries;

    cuda_set_device(0);
-    cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming);
+    check_cuda_error(
+        cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
    create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
                                    use_seq, allocate_gpu_memory, size_tracker);
-    cudaEventRecord(create_indexes_done, streams.stream(0));
+    check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
    cuda_set_device(1);
-    cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0);
+    check_cuda_error(
+        cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
    cuda_set_device(2);
-    cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0);
+    check_cuda_error(
+        cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));

    scatter_indexes_for_overflowing_sub(
        streams.stream(1), streams.gpu_index(1),
@@ -842,7 +845,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    free(second_indexes_for_overflow_sub_gpu_2);
    free(scalars_for_overflow_sub_gpu_2);

-    cudaEventDestroy(create_indexes_done);
+    check_cuda_error(cudaEventDestroy(create_indexes_done));

    // release sub streams
    sub_streams_1.release();
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -105,22 +105,32 @@ typedef struct {
  uint32_t polynomial_size;
 } CudaPackedGlweCiphertextListFFI;

+// FFI-boundary parameter struct for a LWE bootstrap key.
+// All fields are plain uint32_t for safe Rust/C++ interop.
+// Use crypto_params() (defined below) to obtain the strongly-typed C++ form.
+typedef struct {
+  uint32_t input_lwe_dimension;
+  uint32_t glwe_dimension;
+  uint32_t polynomial_size;
+  uint32_t base_log;
+  uint32_t level_count;
+  uint32_t big_lwe_dimension;
+  uint32_t pbs_type;
+  uint32_t grouping_factor;
+} CudaLweBootstrapKeyParamsFFI;
+
 uint64_t scratch_cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t input_lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, uint64_t lut_degree,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 uint64_t scratch_cuda_apply_many_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_many_lut, uint64_t lut_degree,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 void cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
@@ -139,12 +149,10 @@ void cuda_apply_many_univariate_lut_64_async(
    uint32_t lut_stride);

 uint64_t scratch_cuda_full_propagation_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_full_propagation_64_inplace_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
@@ -153,26 +161,21 @@ void cuda_full_propagation_64_inplace_async(
 void cleanup_cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

-uint64_t scratch_cuda_integer_mult_64_async(
+void cuda_integer_mult_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_inout,
+    bool const is_bool_left, CudaRadixCiphertextFFI const *radix_lwe_right,
+    bool const is_bool_right, void *const *bsks, void *const *ksks,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
+
+uint64_t scratch_cuda_integer_mult_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
    bool const is_boolean_right, uint32_t message_modulus,
-    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
-    uint32_t num_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t carry_modulus, CudaLweBootstrapKeyParamsFFI bsk_params,
+    uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_mult_64_async(CudaStreamsFFI streams,
-                                CudaRadixCiphertextFFI *radix_lwe_out,
-                                CudaRadixCiphertextFFI const *radix_lwe_left,
-                                bool const is_bool_left,
-                                CudaRadixCiphertextFFI const *radix_lwe_right,
-                                bool const is_bool_right, void *const *bsks,
-                                void *const *ksks, int8_t *mem_ptr,
-                                uint32_t polynomial_size, uint32_t num_blocks);
-
-void cleanup_cuda_integer_mult_64(CudaStreamsFFI streams,
-                                  int8_t **mem_ptr_void);
+void cleanup_cuda_integer_mult_inplace_64(CudaStreamsFFI streams,
+                                          int8_t **mem_ptr_void);

 void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
                               CudaRadixCiphertextFFI *lwe_array_out,
@@ -186,12 +189,10 @@ void cuda_scalar_addition_ciphertext_64_inplace(
    uint32_t message_modulus, uint32_t carry_modulus);

 uint64_t scratch_cuda_logical_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_logical_scalar_shift_64_inplace_async(
@@ -199,12 +200,10 @@ void cuda_logical_scalar_shift_64_inplace_async(
    int8_t *mem_ptr, void *const *bsks, void *const *ksks);

 uint64_t scratch_cuda_arithmetic_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_arithmetic_scalar_shift_64_inplace_async(
@@ -218,12 +217,10 @@ void cleanup_cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_shift_and_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_shift_and_rotate_64_inplace_async(
@@ -235,22 +232,18 @@ void cleanup_cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, COMPARISON_TYPE op_type,
    bool is_signed, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 uint64_t scratch_cuda_integer_scalar_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, COMPARISON_TYPE op_type,
    bool is_signed, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -273,32 +266,26 @@ void cleanup_cuda_integer_comparison_64(CudaStreamsFFI streams,
 void cleanup_cuda_integer_scalar_comparison_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void);

-uint64_t scratch_cuda_boolean_bitop_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+void cuda_boolean_bitop_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
+    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks);
+
+uint64_t scratch_cuda_boolean_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_boolean_bitop_64_async(CudaStreamsFFI streams,
-                                 CudaRadixCiphertextFFI *lwe_array_out,
-                                 CudaRadixCiphertextFFI const *lwe_array_1,
-                                 CudaRadixCiphertextFFI const *lwe_array_2,
-                                 int8_t *mem_ptr, void *const *bsks,
-                                 void *const *ksks);
-
-void cleanup_cuda_boolean_bitop_64(CudaStreamsFFI streams,
-                                   int8_t **mem_ptr_void);
+void cleanup_cuda_boolean_bitop_inplace_64(CudaStreamsFFI streams,
+                                           int8_t **mem_ptr_void);

 uint64_t scratch_cuda_boolean_bitnot_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -316,51 +303,45 @@ void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
                               uint32_t param_message_modulus,
                               uint32_t param_carry_modulus);

-uint64_t scratch_cuda_integer_bitop_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+uint64_t scratch_cuda_integer_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

-uint64_t scratch_cuda_integer_scalar_bitop_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_scalar_bitop_64_async(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
-    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
+void cuda_integer_bitop_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
+    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks);

-void cuda_integer_bitop_64_async(CudaStreamsFFI streams,
-                                 CudaRadixCiphertextFFI *lwe_array_out,
-                                 CudaRadixCiphertextFFI const *lwe_array_1,
-                                 CudaRadixCiphertextFFI const *lwe_array_2,
-                                 int8_t *mem_ptr, void *const *bsks,
-                                 void *const *ksks);
+void cuda_integer_scalar_bitop_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
+    void const *clear_blocks, void const *h_clear_blocks,
+    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks);

-void cleanup_cuda_integer_bitop_64(CudaStreamsFFI streams,
-                                   int8_t **mem_ptr_void);
+void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
+                                           int8_t **mem_ptr_void);

-void cleanup_cuda_integer_scalar_bitop_64(CudaStreamsFFI streams,
-                                          int8_t **mem_ptr_void);
+void cleanup_cuda_integer_scalar_bitop_inplace_64(CudaStreamsFFI streams,
+                                                  int8_t **mem_ptr_void);

-uint64_t scratch_cuda_cmux_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+uint64_t scratch_cuda_cmux_64_async(CudaStreamsFFI streams, int8_t **mem_ptr,
+                                    CudaLweBootstrapKeyParamsFFI bsk_params,
+                                    uint32_t ks_level, uint32_t ks_base_log,
+                                    uint32_t lwe_ciphertext_count,
+                                    uint32_t message_modulus,
+                                    uint32_t carry_modulus,
+                                    bool allocate_gpu_memory,
+                                    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_cmux_64_async(CudaStreamsFFI streams,
                        CudaRadixCiphertextFFI *lwe_array_out,
@@ -372,12 +353,10 @@ void cuda_cmux_64_async(CudaStreamsFFI streams,
 void cleanup_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_scalar_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_scalar_rotate_64_inplace_async(CudaStreamsFFI streams,
@@ -389,21 +368,17 @@ void cleanup_cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

 uint64_t scratch_cuda_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_propagate_single_carry_64_inplace_async(
@@ -425,12 +400,10 @@ void cleanup_cuda_add_and_propagate_single_carry_64_inplace(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_overflowing_sub_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t compute_overflow, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_overflowing_sub_64_inplace_async(
@@ -445,14 +418,12 @@ void cleanup_cuda_integer_overflowing_sub_64_inplace(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_in_radix,
    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t carry_modulus, bool reduce_degrees_for_single_carry_propagation,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_partial_sum_ciphertexts_vec_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
@@ -463,12 +434,11 @@ void cleanup_cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_scalar_mul_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_scalar_bits, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_scalar_mul_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
@@ -481,11 +451,9 @@ void cleanup_cuda_integer_scalar_mul_64(CudaStreamsFFI streams,

 uint64_t scratch_cuda_integer_div_rem_64_async(
    CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_div_rem_64_async(CudaStreamsFFI streams,
@@ -504,11 +472,9 @@ void cuda_integer_reverse_blocks_64_inplace_async(

 uint64_t scratch_cuda_integer_abs_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_abs_inplace_64_async(CudaStreamsFFI streams,
@@ -520,12 +486,10 @@ void cleanup_cuda_integer_abs_inplace_64(CudaStreamsFFI streams,
                                         int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_are_all_comparisons_block_true_64_async(
@@ -537,12 +501,10 @@ void cleanup_cuda_integer_are_all_comparisons_block_true_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_is_at_least_one_comparisons_block_true_64_async(
@@ -566,13 +528,11 @@ void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
                              CudaStreamsFFI streams);

 uint64_t scratch_cuda_apply_noise_squashing_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t num_original_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t input_glwe_dimension,
+    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_radix_blocks, uint32_t num_original_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_apply_noise_squashing_async(
@@ -584,12 +544,10 @@ void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void);

 uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_sub_and_propagate_single_carry_64_inplace_async(
@@ -602,13 +560,11 @@ void cleanup_cuda_sub_and_propagate_single_carry_64_inplace(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_unsigned_scalar_div_radix_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
@@ -619,13 +575,11 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_signed_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_signed_scalar_div_radix_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
@@ -636,12 +590,10 @@ void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_unsigned_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -658,12 +610,10 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_signed_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -679,12 +629,11 @@ void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_count_of_consecutive_bits_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t counter_num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    Direction direction, BitValue bit_value, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t counter_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, Direction direction,
+    BitValue bit_value, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_count_of_consecutive_bits_64_async(
@@ -696,13 +645,12 @@ void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_grouped_oprf_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_process,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_to_process,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t total_random_bits,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_grouped_oprf_64_async(CudaStreamsFFI streams,
                                        CudaRadixCiphertextFFI *radix_lwe_out,
@@ -714,31 +662,28 @@ void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
                                          int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_grouped_oprf_custom_range_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_intermediate,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t num_input_random_bits, uint32_t num_scalar_bits,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_intermediate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t num_input_random_bits,
+    uint32_t num_scalar_bits, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_grouped_oprf_custom_range_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    uint32_t num_blocks_intermediate, const void *seeded_lwe_input,
    const uint64_t *decomposed_scalar, const uint64_t *has_at_least_one_set,
    uint32_t num_scalars, uint32_t shift, int8_t *mem, void *const *bsks,
-    void *const *ksks);
+    void *const *compute_bsks, void *const *ksks);

 void cleanup_cuda_integer_grouped_oprf_custom_range_64(CudaStreamsFFI streams,
                                                       int8_t **mem_ptr_void);

 uint64_t scratch_cuda_integer_ilog2_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t input_num_blocks, uint32_t counter_num_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t input_num_blocks, uint32_t counter_num_blocks,
    uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

@@ -754,14 +699,12 @@ void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
                                   int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_match_value_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_matches, uint32_t num_input_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_match_value_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_result,
@@ -774,13 +717,11 @@ void cleanup_cuda_unchecked_match_value_64(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);

 uint64_t scratch_cuda_cast_to_unsigned_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_input_blocks, uint32_t target_num_blocks, bool input_is_signed,
-    bool requires_full_propagate, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    bool input_is_signed, bool requires_full_propagate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_cast_to_unsigned_64_async(CudaStreamsFFI streams,
@@ -794,14 +735,12 @@ void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_match_value_or_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_matches, uint32_t num_input_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
    uint32_t max_output_is_zero, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_match_value_or_64_async(
@@ -815,12 +754,10 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_contains_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
@@ -835,12 +772,10 @@ void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_contains_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_clear_64_async(
@@ -853,12 +788,10 @@ void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_clears, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_is_in_clears_64_async(CudaStreamsFFI streams,
@@ -873,12 +806,10 @@ void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_clears, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_in_clears_64_async(
@@ -892,12 +823,10 @@ void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_unique, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_in_clears_64_async(
@@ -915,12 +844,10 @@ void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_of_clear_64_async(
@@ -934,12 +861,10 @@ void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
                                                    int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_first_index_of_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_of_64_async(
@@ -953,12 +878,10 @@ void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_index_of_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
@@ -974,12 +897,10 @@ void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t num_blocks_index, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_of_clear_64_async(
@@ -994,12 +915,10 @@ void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_all_eq_slices_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_inputs, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_all_eq_slices_64_async(
@@ -1012,12 +931,10 @@ void cleanup_cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
                                             int8_t **mem_ptr_void);

 uint64_t scratch_cuda_unchecked_contains_sub_slice_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_lhs, uint32_t num_rhs, uint32_t num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_lhs, uint32_t num_rhs,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_sub_slice_64_async(
@@ -1030,12 +947,10 @@ void cleanup_cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
                                                  int8_t **mem_ptr_void);

 uint64_t scratch_cuda_cast_to_signed_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_input_blocks,
-    uint32_t target_num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool input_is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool input_is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_cast_to_signed_64_async(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -345,6 +345,26 @@ struct int_radix_params {
        message_modulus(message_modulus), carry_modulus(carry_modulus),
        noise_reduction_type(noise_reduction_type){};

+  // Builds the radix parameters from an FFI bootstrapping-key descriptor
+  // instead of individual scalars. The PBS-related fields are read from
+  // bsk_params (input_lwe_dimension -> small_lwe_dimension, level_count ->
+  // pbs_level, base_log -> pbs_base_log); the keyswitch decomposition
+  // parameters, moduli and noise reduction type are passed explicitly.
+  int_radix_params(CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+                   uint32_t ks_base_log, uint32_t message_modulus,
+                   uint32_t carry_modulus,
+                   PBS_MS_REDUCTION_T noise_reduction_type)
+      : pbs_type(static_cast<PBS_TYPE>(bsk_params.pbs_type)),
+        glwe_dimension(bsk_params.glwe_dimension),
+        polynomial_size(bsk_params.polynomial_size),
+        big_lwe_dimension(bsk_params.big_lwe_dimension),
+        small_lwe_dimension(bsk_params.input_lwe_dimension), ks_level(ks_level),
+        ks_base_log(ks_base_log), pbs_level(bsk_params.level_count),
+        pbs_base_log(bsk_params.base_log),
+        grouping_factor(bsk_params.grouping_factor),
+        message_modulus(message_modulus), carry_modulus(carry_modulus),
+        noise_reduction_type(noise_reduction_type){};
+
  int_radix_params() = default;

  void print() {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/rerand.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/rerand.h
@@ -2,12 +2,20 @@

 #include "integer.h"

+// Selects which re-randomization variant a scratch buffer is built for.
+// int_rerand_mem only allocates its keyswitched-LWE buffer and trivial LWE
+// indexes for RERAND_WITH_KS; with RERAND_WITHOUT_KS they stay nullptr.
+enum RERAND_MODE {
+  RERAND_WITH_KS = 0,
+  RERAND_WITHOUT_KS = 1,
+};
+
 extern "C" {
 uint64_t scratch_cuda_rerand_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, bool allocate_gpu_memory);
+    uint32_t carry_modulus, bool allocate_gpu_memory, RERAND_MODE rerand_mode);

 void cuda_rerand_64_async(
    CudaStreamsFFI streams, void *lwe_array,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h
@@ -3,16 +3,18 @@
 #include "checked_arithmetic.h"
 #include "integer_utilities.h"
 #include "keyswitch/ks_enums.h"
+#include "rerand.h"
 #include "zk/expand.cuh"
 #include "zk/zk_utilities.h"

 template <typename Torus> struct int_rerand_mem {
  int_radix_params params;

-  Torus *tmp_zero_lwes;
-  Torus *tmp_ksed_zero_lwes;
-  Torus *lwe_trivial_indexes;
+  Torus *tmp_expanded_zero_lwes = nullptr;
+  Torus *tmp_ksed_expanded_zero_lwes = nullptr;
+  Torus *lwe_trivial_indexes = nullptr;
  uint32_t num_lwes;
+  RERAND_MODE rerand_mode;

  bool gpu_memory_allocated;

@@ -20,24 +22,20 @@ template <typename Torus> struct int_rerand_mem {
      ks_tmp_buf_vec; // not allocated, ReRand not using GEMM KS for now
  // kept empty to pass to the KS function indicating GEMM KS disabled

-  expand_job<Torus> *d_expand_jobs;
-  expand_job<Torus> *h_expand_jobs;
+  expand_job<Torus> *d_expand_jobs = nullptr;
+  expand_job<Torus> *h_expand_jobs = nullptr;

  int_rerand_mem(CudaStreams streams, int_radix_params params,
-                 const uint32_t num_lwes, const bool allocate_gpu_memory,
-                 uint64_t &size_tracker)
-      : params(params), num_lwes(num_lwes),
+                 const uint32_t num_lwes, const RERAND_MODE rerand_mode,
+                 const bool allocate_gpu_memory, uint64_t &size_tracker)
+      : params(params), num_lwes(num_lwes), rerand_mode(rerand_mode),
        gpu_memory_allocated(allocate_gpu_memory) {

-    tmp_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, params.big_lwe_dimension + 1),
-        streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
-
-    tmp_ksed_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, params.small_lwe_dimension + 1),
-        streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
+    tmp_expanded_zero_lwes =
+        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
+            safe_mul_sizeof<Torus>(num_lwes, params.big_lwe_dimension + 1),
+            streams.stream(0), streams.gpu_index(0), size_tracker,
+            allocate_gpu_memory));

    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
@@ -46,47 +44,63 @@ template <typename Torus> struct int_rerand_mem {

    h_expand_jobs = static_cast<expand_job<Torus> *>(
        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
+    PANIC_IF_FALSE(h_expand_jobs != nullptr,
+                   "host allocation failed for h_expand_jobs");

-    auto h_lwe_trivial_indexes =
-        static_cast<Torus *>(malloc(safe_mul_sizeof<Torus>(num_lwes)));
-    for (auto i = 0; i < num_lwes; ++i) {
-      h_lwe_trivial_indexes[i] = i;
+    if (rerand_mode == RERAND_MODE::RERAND_WITH_KS) {
+      tmp_ksed_expanded_zero_lwes =
+          static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
+              safe_mul_sizeof<Torus>(num_lwes, params.small_lwe_dimension + 1),
+              streams.stream(0), streams.gpu_index(0), size_tracker,
+              allocate_gpu_memory));
+
+      auto h_lwe_trivial_indexes =
+          static_cast<Torus *>(malloc(safe_mul_sizeof<Torus>(num_lwes)));
+      PANIC_IF_FALSE(h_lwe_trivial_indexes != nullptr,
+                     "host allocation failed for h_lwe_trivial_indexes");
+      for (uint32_t i = 0; i < num_lwes; ++i) {
+        h_lwe_trivial_indexes[i] = i;
+      }
+      lwe_trivial_indexes =
+          static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
+              safe_mul_sizeof<Torus>(num_lwes), streams.stream(0),
+              streams.gpu_index(0), size_tracker, allocate_gpu_memory));
+      cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_trivial_indexes,
+                               safe_mul_sizeof<Torus>(num_lwes),
+                               streams.stream(0), streams.gpu_index(0));
+      cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+      free(h_lwe_trivial_indexes);
+    } else {
+      cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    }
-    lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes), streams.stream(0),
-        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
-    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_trivial_indexes,
-                             safe_mul_sizeof<Torus>(num_lwes),
-                             streams.stream(0), streams.gpu_index(0));
-
-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-
-    free(h_lwe_trivial_indexes);
  }

  void release(CudaStreams streams) {
-    cuda_drop_with_size_tracking_async(tmp_zero_lwes, streams.stream(0),
-                                       streams.gpu_index(0),
+    cuda_drop_with_size_tracking_async(tmp_expanded_zero_lwes,
+                                       streams.stream(0), streams.gpu_index(0),
                                       gpu_memory_allocated);
-    tmp_zero_lwes = nullptr;
-    cuda_drop_with_size_tracking_async(tmp_ksed_zero_lwes, streams.stream(0),
-                                       streams.gpu_index(0),
-                                       gpu_memory_allocated);
-    tmp_ksed_zero_lwes = nullptr;
-    cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0),
-                                       streams.gpu_index(0),
-                                       gpu_memory_allocated);
-    lwe_trivial_indexes = nullptr;
+    tmp_expanded_zero_lwes = nullptr;
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    d_expand_jobs = nullptr;

-    for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
-      cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
-                             ks_tmp_buf_vec[i], gpu_memory_allocated);
+    if (rerand_mode == RERAND_MODE::RERAND_WITH_KS) {
+      cuda_drop_with_size_tracking_async(
+          tmp_ksed_expanded_zero_lwes, streams.stream(0), streams.gpu_index(0),
+          gpu_memory_allocated);
+      tmp_ksed_expanded_zero_lwes = nullptr;
+      cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0),
+                                         streams.gpu_index(0),
+                                         gpu_memory_allocated);
+      lwe_trivial_indexes = nullptr;
+
+      for (size_t i = 0; i < ks_tmp_buf_vec.size(); i++) {
+        cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
+                               ks_tmp_buf_vec[i], gpu_memory_allocated);
+      }
+      ks_tmp_buf_vec.clear();
    }
-    ks_tmp_buf_vec.clear();

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_expand_jobs);
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
@@ -5,12 +5,11 @@

 extern "C" {
 uint64_t scratch_cuda_kreyvium_generate_keystream_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_inputs);

 void cuda_kreyvium_generate_keystream_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -64,6 +64,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
    void const *lwe_array_in, const uint64_t plaintext_in,
    const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_inplace_32(
+    void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
+    CudaRadixCiphertextFFI const *input_2);
+void cuda_add_lwe_ciphertext_vector_inplace_64(
+    void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
+    CudaRadixCiphertextFFI const *input_2);
 }

 #endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
@@ -5,12 +5,11 @@

 extern "C" {
 uint64_t scratch_cuda_trivium_generate_keystream_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_inputs);

 void cuda_trivium_generate_keystream_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
  uint32_t num_lwes;
  uint32_t num_compact_lists;

-  int_radix_lut<Torus> *message_and_carry_extract_luts;
-  int_radix_lut<Torus> *identity_lut;
+  int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
+  int_radix_lut<Torus> *identity_lut = nullptr;

-  Torus *tmp_expanded_lwes;
-  Torus *tmp_ksed_small_to_big_expanded_lwes;
+  Torus *tmp_expanded_lwes = nullptr;
+  Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;

  bool gpu_memory_allocated;

@@ -148,66 +148,6 @@ template <typename Torus> struct zk_expand_mem {
      PANIC("GPU backend requires carry_modulus equal to message_modulus")
    }

-    // We create the identity LUT only if we are doing a SANITY_CHECK
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut =
-          new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
-                                   allocate_gpu_memory, size_tracker);
-
-      auto identity_lut_f = [](Torus x) -> Torus { return x; };
-
-      identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
-                                               LUT_0_FOR_ALL_BLOCKS);
-    }
-
-    auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
-      return x % casting_params.message_modulus;
-    };
-    auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
-      return (x / casting_params.carry_modulus) %
-             casting_params.message_modulus;
-    };
-
-    // Booleans have to be sanitized
-    auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
-    auto message_extract_and_sanitize_bool_lut_f =
-        [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-      return sanitize_bool_f(message_extract_lut_f(x));
-    };
-    auto carry_extract_and_sanitize_bool_lut_f =
-        [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
-      return sanitize_bool_f(carry_extract_lut_f(x));
-    };
-
-    /** In case the casting key casts from BIG to SMALL key we run a single KS
-    to expand using the casting key as ksk. Otherwise, in case the casting key
-    casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
-    the casting key as ksk, then we keyswitch from BIG to SMALL using the
-    computing ksk, and lastly we apply the PBS. The output is always on the
-    BIG key.
-    **/
-    auto params = casting_params;
-    if (casting_key_type == SMALL_TO_BIG) {
-      params = computing_params;
-    }
-    message_and_carry_extract_luts = new int_radix_lut<Torus>(
-        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
-
-    // We are always packing two LWEs. We just need to be sure we have enough
-    // space in the carry part to store a message of the same size as is in the
-    // message part.
-    if (params.carry_modulus < params.message_modulus)
-      PANIC("Carry modulus must be at least as large as message modulus");
-    auto num_packed_msgs = 2;
-
-    // Adjust indexes to permute the output and access the correct LUT
-    auto h_indexes_in = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-    auto h_indexes_out = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-    auto h_lut_indexes = static_cast<Torus *>(
-        malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
-
    d_expand_jobs =
        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
            safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
@@ -216,155 +156,202 @@ template <typename Torus> struct zk_expand_mem {
    h_expand_jobs = static_cast<expand_job<Torus> *>(
        malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));

-    /*
-     * Each LWE contains encrypted data in both carry and message spaces
-     * that needs to be extracted.
-     *
-     * The loop processes each compact list (k) and for each LWE within that
-     * list:
-     * 1. Sets input indexes to read each LWE twice (for carry and message
-     * extraction)
-     * 2. Creates output indexes to properly reorder the results
-     * 3. Selects appropriate LUT index based on whether boolean sanitization is
-     * needed
-     *
-     * We want the output to have always first the content of the message part
-     * and then the content of the carry part of each LWE.
-     *
-     * i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
-     * carry_extract(LWE_1), ...
-     *
-     * Aiming that behavior, with 4 LWEs we would have:
-     *
-     * // Each LWE is processed twice
-     * h_indexes_in   = {0, 1, 2, 3, 0, 1, 2, 3}
-     *
-     * // First 4 use message LUT, last 4 use carry LUT
-     * h_lut_indexes  = {0, 0, 0, 0, 1, 1, 1, 1}
-     *
-     * // Reorders output so message and carry for each LWE appear together
-     * h_indexes_out  = {0, 2, 4, 6, 1, 3, 5, 7}
-     *
-     * If an LWE contains a boolean value, its LUT index is shifted by
-     * num_packed_msgs to use the sanitization LUT (which ensures output is
-     * exactly 0 or 1).
-     */
-    auto offset = 0;
-    for (int k = 0; k < num_compact_lists; k++) {
-      auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
-      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
-        auto lwe_index = i + num_packed_msgs * offset;
-        auto lwe_index_in_list = i % num_lwes_in_kth;
-        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
-                       "Cuda error: index %d is beyond the max value %d",
-                       lwe_index, num_packed_msgs * num_lwes);
-        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
-        h_indexes_out[lwe_index] =
-            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %lu is beyond the max value %lu",
-                       (unsigned long)h_indexes_in[lwe_index],
-                       (unsigned long)(num_packed_msgs * num_lwes));
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %lu is beyond the max value %lu",
-                       (unsigned long)h_indexes_out[lwe_index],
-                       (unsigned long)(num_packed_msgs * num_lwes));
-        // is_boolean_array tells us which input is a boolean and thus the
-        // related output needs boolean sanitization. It naturally has
-        // total_blocks entries, but h_indexes_out reaches
-        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
-        // the ceiling causes out-of-bounds access. Reading garbage "true" would
-        // set h_lut_indexes to an invalid index pointing to uninitialized
-        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
-        // to match.
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
-                       "Cuda error: index %lu for is_boolean_array is out of "
-                       "bounds (len is %lu)",
-                       (unsigned long)h_indexes_out[lwe_index],
-                       (unsigned long)is_boolean_array_len);
+    // NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
+    // no intermediate buffers needed.
+    if (expand_kind != EXPAND_KIND::NO_CASTING) {
+      /** In case the casting key casts from BIG to SMALL key we run a single KS
+      to expand using the casting key as ksk. Otherwise, in case the casting key
+      casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
+      the casting key as ksk, then we keyswitch from BIG to SMALL using the
+      computing ksk, and lastly we apply the PBS. The output is always on the
+      BIG key.
+      **/
+      auto params = casting_params;
+      if (casting_key_type == SMALL_TO_BIG) {
+        params = computing_params;
      }
-      offset += num_lwes_in_kth;
-    }

-    message_and_carry_extract_luts->set_lwe_indexes(
-        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
+      // Each LWE always carries two packed messages (message and carry parts)
+      auto num_packed_msgs = 2;

-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
-                                    h_indexes_in, h_indexes_out);
-    }
+      // Adjust indexes to permute the output and access the correct LUT.
+      //
+      // The loop below fills h_indexes_in and h_indexes_out so that the output
+      // is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
+      // msg_extract(LWE_1), carry_extract(LWE_1), ...
+      //
+      // With 4 LWEs the arrays look like:
+      //   h_indexes_in  = {0, 1, 2, 3, 0, 1, 2, 3}  (each LWE read twice)
+      //   h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}  (msg LUT then carry LUT)
+      //   h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}  (interleaved output)
+      //
+      // If an LWE contains a boolean its LUT index is shifted by
+      // num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
+      auto h_indexes_in = static_cast<Torus *>(
+          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+      auto h_indexes_out = static_cast<Torus *>(
+          malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));

-    auto active_streams =
-        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
-
-    // Index generator for message/carry extraction LUTs
-    auto index_gen = [num_compact_lists,
-                      num_lwes_per_compact_list =
-                          this->num_lwes_per_compact_list,
-                      num_packed_msgs, is_boolean_array,
-                      h_indexes_out](Torus *h_lut_indexes, uint32_t) {
      auto offset = 0;
      for (int k = 0; k < num_compact_lists; k++) {
-        auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+        auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
        for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
          auto lwe_index = i + num_packed_msgs * offset;
-          auto boolean_offset =
-              is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
-          h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+          auto lwe_index_in_list = i % num_lwes_in_kth;
+          PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                         "Cuda error: index %d is beyond the max value %d",
+                         lwe_index, num_packed_msgs * num_lwes);
+          h_indexes_in[lwe_index] = lwe_index_in_list + offset;
+          h_indexes_out[lwe_index] =
+              num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
+          PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                         "Cuda error: index %lu is beyond the max value %lu",
+                         (unsigned long)h_indexes_in[lwe_index],
+                         (unsigned long)(num_packed_msgs * num_lwes));
+          PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                         "Cuda error: index %lu is beyond the max value %lu",
+                         (unsigned long)h_indexes_out[lwe_index],
+                         (unsigned long)(num_packed_msgs * num_lwes));
+          // is_boolean_array tells us which input is a boolean and thus the
+          // related output needs boolean sanitization. It naturally has
+          // total_blocks entries, but h_indexes_out reaches
+          // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is
+          // odd, the ceiling causes out-of-bounds access. Reading garbage
+          // "true" would set h_lut_indexes to an invalid index pointing to
+          // uninitialized memory instead of a real LUT. Rust pads
+          // is_boolean_array with FALSE to match.
+          PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                         "Cuda error: index %lu for is_boolean_array is out of "
+                         "bounds (len is %lu)",
+                         (unsigned long)h_indexes_out[lwe_index],
+                         (unsigned long)is_boolean_array_len);
        }
        offset += num_lwes_in_kth;
      }
-    };

-    message_and_carry_extract_luts->generate_and_broadcast_lut(
-        active_streams, {0, 1, 2, 3},
-        {message_extract_lut_f, carry_extract_lut_f,
-         message_extract_and_sanitize_bool_lut_f,
-         carry_extract_and_sanitize_bool_lut_f},
-        index_gen, true, {}, h_lut_indexes);
+      auto active_streams =
+          streams.active_gpu_subset(2 * num_lwes, params.pbs_type);

-    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
-        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
+      // SANITY_CHECK uses identity_lut (skipping the full message/carry
+      // extraction LUT and the SMALL_TO_BIG intermediate buffer).
+      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        identity_lut =
+            new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
+                                     allocate_gpu_memory, size_tracker);

-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
-          active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
+        auto identity_lut_f = [](Torus x) -> Torus { return x; };
+        identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
+                                                 LUT_0_FOR_ALL_BLOCKS);
+        identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
+                                      h_indexes_in, h_indexes_out);
+        identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
+            active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
+      } else {
+        // We always pack two LWEs, so we only need to make sure the carry part
+        // has enough space to store a message of the same size as the one in
+        // the message part.
+        if (params.carry_modulus < params.message_modulus)
+          PANIC("Carry modulus must be at least as large as message modulus");
+
+        message_and_carry_extract_luts =
+            new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
+                                     allocate_gpu_memory, size_tracker);
+        message_and_carry_extract_luts->set_lwe_indexes(
+            streams.stream(0), streams.gpu_index(0), h_indexes_in,
+            h_indexes_out);
+
+        auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
+          return x % casting_params.message_modulus;
+        };
+        auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
+          return (x / casting_params.carry_modulus) %
+                 casting_params.message_modulus;
+        };
+        auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
+        auto message_extract_and_sanitize_bool_lut_f =
+            [message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+          return sanitize_bool_f(message_extract_lut_f(x));
+        };
+        auto carry_extract_and_sanitize_bool_lut_f =
+            [carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
+          return sanitize_bool_f(carry_extract_lut_f(x));
+        };
+
+        auto h_lut_indexes = static_cast<Torus *>(
+            malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
+
+        auto index_gen = [num_compact_lists,
+                          num_lwes_per_compact_list =
+                              this->num_lwes_per_compact_list,
+                          num_packed_msgs, is_boolean_array,
+                          h_indexes_out](Torus *h_lut_indexes, uint32_t) {
+          auto offset = 0;
+          for (int k = 0; k < num_compact_lists; k++) {
+            auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+            for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
+              auto lwe_index = i + num_packed_msgs * offset;
+              auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
+                                        ? num_packed_msgs
+                                        : 0;
+              h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+            }
+            offset += num_lwes_in_kth;
+          }
+        };
+
+        message_and_carry_extract_luts->generate_and_broadcast_lut(
+            active_streams, {0, 1, 2, 3},
+            {message_extract_lut_f, carry_extract_lut_f,
+             message_extract_and_sanitize_bool_lut_f,
+             carry_extract_and_sanitize_bool_lut_f},
+            index_gen, true, {}, h_lut_indexes);
+        message_and_carry_extract_luts
+            ->allocate_lwe_vector_for_non_trivial_indexes(
+                active_streams, 2 * num_lwes, size_tracker,
+                allocate_gpu_memory);
+        free(h_lut_indexes);
+
+        // SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
+        // on the full casting path.
+        tmp_ksed_small_to_big_expanded_lwes =
+            (Torus *)cuda_malloc_with_size_tracking_async(
+                safe_mul_sizeof<Torus>(num_lwes,
+                                       casting_params.big_lwe_dimension + 1),
+                streams.stream(0), streams.gpu_index(0), size_tracker,
+                allocate_gpu_memory);
+      }
+
+      // The expanded LWEs will always be in the casting key format
+      tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
+          safe_mul_sizeof<Torus>(num_lwes,
+                                 casting_params.big_lwe_dimension + 1),
+          streams.stream(0), streams.gpu_index(0), size_tracker,
+          allocate_gpu_memory);
+
+      free(h_indexes_in);
+      free(h_indexes_out);
    }

-    // The expanded LWEs will always be on the casting key format
-    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
-        safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
-        streams.stream(0), streams.gpu_index(0), size_tracker,
-        allocate_gpu_memory);
-
-    tmp_ksed_small_to_big_expanded_lwes =
-        (Torus *)cuda_malloc_with_size_tracking_async(
-            safe_mul_sizeof<Torus>(num_lwes,
-                                   casting_params.big_lwe_dimension + 1),
-            streams.stream(0), streams.gpu_index(0), size_tracker,
-            allocate_gpu_memory);
-
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-    free(h_indexes_in);
-    free(h_indexes_out);
-    free(h_lut_indexes);
  }

  void release(CudaStreams streams) {
-    message_and_carry_extract_luts->release(streams);
-    delete message_and_carry_extract_luts;
-
-    if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
-      identity_lut->release(streams);
-      delete identity_lut;
+    if (expand_kind != EXPAND_KIND::NO_CASTING) {
+      if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
+        identity_lut->release(streams);
+        delete identity_lut;
+      } else {
+        message_and_carry_extract_luts->release(streams);
+        delete message_and_carry_extract_luts;
+        cuda_drop_with_size_tracking_async(
+            tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
+            streams.gpu_index(0), gpu_memory_allocated);
+      }
+      cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
+                                         streams.gpu_index(0),
+                                         gpu_memory_allocated);
    }

-    cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
-                                       streams.gpu_index(0),
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
-                                       streams.stream(0), streams.gpu_index(0),
-                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
@@ -2,19 +2,14 @@
 #include "aes.cuh"

 uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_aes_encrypt<uint64_t>(
      CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
@@ -22,19 +17,14 @@ uint64_t scratch_cuda_integer_aes_ctr_encrypt_64_async(
 }

 uint64_t scratch_cuda_integer_aes_ctr_256_encrypt_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
+    uint32_t num_aes_inputs, uint32_t sbox_parallelism) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_aes_encrypt<uint64_t>(
      CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
@@ -78,18 +68,13 @@ void cleanup_cuda_integer_aes_ctr_256_encrypt_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_key_expansion_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_key_expansion<uint64_t>(
      CudaStreams(streams), (int_key_expansion_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
@@ -390,7 +390,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_a[6], &wires_a[15], &input_bits[7]);
  XOR(&wires_a[10], &wires_a[15], &wires_b[0]);
  XOR(&wires_a[11], &wires_a[20], &wires_a[9]);
-  FLUSH(&wires_a[6], &wires_a[10]);
+  FLUSH(&wires_a[6], &wires_a[10], &wires_a[11]);
  XOR(&wires_a[7], &input_bits[7], &wires_a[11]);
  FLUSH(&wires_a[7]);
  XOR(&wires_a[17], &wires_a[10], &wires_a[11]);
@@ -426,7 +426,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[22], &wires_b[18], &wires_a[19]);
  XOR(&wires_b[23], &wires_b[19], &wires_a[21]);
  XOR(&wires_b[24], &wires_b[20], &wires_a[18]);
-  FLUSH(&wires_b[21], &wires_b[23], &wires_b[24]);
+  FLUSH(&wires_b[21], &wires_b[22], &wires_b[23], &wires_b[24]);
  XOR(&wires_b[25], &wires_b[21], &wires_b[22]);
  FLUSH(&wires_b[25]);

@@ -468,7 +468,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,

  XOR(&wires_b[37], &wires_b[36], &wires_b[34]);
  XOR(&wires_b[38], &wires_b[27], &wires_b[36]);
-  FLUSH(&wires_b[38]);
+  FLUSH(&wires_b[38], &wires_b[37]);
  XOR(&wires_b[44], &wires_b[33], &wires_b[37]);

  CudaRadixCiphertextFFI *and_outs_6[] = {&wires_b[39]};
@@ -479,7 +479,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[40], &wires_b[25], &wires_b[39]);
  XOR(&wires_b[41], &wires_b[40], &wires_b[37]);
  XOR(&wires_b[43], &wires_b[29], &wires_b[40]);
-  FLUSH(&wires_b[41]);
+  FLUSH(&wires_b[41], &wires_b[40], &wires_b[43], &wires_b[44]);
  XOR(&wires_b[45], &wires_b[42], &wires_b[41]);
  FLUSH(&wires_b[45]);

@@ -514,6 +514,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
  XOR(&wires_b[57], &wires_b[50], &wires_b[53]);
  XOR(&wires_b[58], &wires_c[4], &wires_b[46]);
  XOR(&wires_b[59], &wires_c[3], &wires_b[54]);
+  FLUSH(&wires_b[57], &wires_b[58]);
  XOR(&wires_b[60], &wires_b[46], &wires_b[57]);
  XOR(&wires_b[61], &wires_c[14], &wires_b[57]);
  XOR(&wires_b[62], &wires_b[52], &wires_b[58]);
@@ -589,6 +590,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
 #undef FLUSH
 #undef AND
 #undef ADD_ONE_FLUSH
+#undef ADD_ONE
 }

 /**
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes256.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes256.cu
@@ -14,18 +14,13 @@ void cuda_integer_aes_ctr_256_encrypt_64_async(
 }

 uint64_t scratch_cuda_integer_key_expansion_256_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_key_expansion_256<uint64_t>(
      CudaStreams(streams), (int_key_expansion_256_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -83,6 +83,8 @@ void cuda_modulus_switch_inplace_64_async(void *stream, uint32_t gpu_index,
 void cuda_modulus_switch_64_async(void *stream, uint32_t gpu_index,
                                  void *lwe_out, const void *lwe_in,
                                  uint32_t size, uint32_t log_modulus) {
+  PANIC_IF_FALSE(lwe_out != lwe_in, "Output and input pointers must be "
+                                    "different for out-of-place operations");
  host_modulus_switch<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
                                static_cast<uint64_t *>(lwe_out),
                                static_cast<const uint64_t *>(lwe_in), size,
@@ -93,6 +95,8 @@ void cuda_centered_modulus_switch_64_async(void *stream, uint32_t gpu_index,
                                           void *lwe_out, const void *lwe_in,
                                           uint32_t lwe_dimension,
                                           uint32_t log_modulus) {
+  PANIC_IF_FALSE(lwe_out != lwe_in, "Output and input pointers must be "
+                                    "different for out-of-place operations");
  host_centered_modulus_switch_inplace<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_out), static_cast<const uint64_t *>(lwe_in),
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -217,6 +217,8 @@ void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
                                         void const *input, void *output,
                                         uint32_t base_log,
                                         uint32_t level_count) {
+  PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
+                                  "for out-of-place operations");
  host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
                                  static_cast<const uint64_t *>(input),
                                  static_cast<uint64_t *>(output), base_log,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -489,7 +489,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_multi_bit(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
-  cudaSetDevice(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  int multibit_size = size / grouping_factor;
  int num_threads = 0, num_blocks = 0;
  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -326,6 +326,10 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index) {
  if (size == 0)
    return;
+
+  GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
+  GPU_ASSERT(dest != nullptr, "Cuda error: null device ptr");
+
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  PANIC_IF_FALSE(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -2,17 +2,12 @@

 uint64_t scratch_cuda_integer_abs_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_abs<uint64_t>(
      CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -1,39 +1,32 @@
 #include "integer/bitwise_ops.cuh"

-uint64_t scratch_cuda_boolean_bitop_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+void cuda_boolean_bitop_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
+    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks) {
+  // In-place variant: lwe_array_inout op= lwe_array_2, no aliasing check needed
+  host_boolean_bitop<uint64_t>(
+      CudaStreams(streams), lwe_array_inout, lwe_array_inout, lwe_array_2,
+      (boolean_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+}
+
+uint64_t scratch_cuda_boolean_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_boolean_bitop<uint64_t>(
      CudaStreams(streams), (boolean_bitop_buffer<uint64_t> **)mem_ptr,
      lwe_ciphertext_count, params, op_type, is_unchecked, allocate_gpu_memory);
 }

-void cuda_boolean_bitop_64_async(CudaStreamsFFI streams,
-                                 CudaRadixCiphertextFFI *lwe_array_out,
-                                 CudaRadixCiphertextFFI const *lwe_array_1,
-                                 CudaRadixCiphertextFFI const *lwe_array_2,
-                                 int8_t *mem_ptr, void *const *bsks,
-                                 void *const *ksks) {
-
-  host_boolean_bitop<uint64_t>(
-      CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
-      (boolean_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
-}
-
-void cleanup_cuda_boolean_bitop_64(CudaStreamsFFI streams,
-                                   int8_t **mem_ptr_void) {
+void cleanup_cuda_boolean_bitop_inplace_64(CudaStreamsFFI streams,
+                                           int8_t **mem_ptr_void) {

  boolean_bitop_buffer<uint64_t> *mem_ptr =
      (boolean_bitop_buffer<uint64_t> *)(*mem_ptr_void);
@@ -43,18 +36,13 @@ void cleanup_cuda_boolean_bitop_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_boolean_bitnot_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
    uint32_t lwe_ciphertext_count, bool is_unchecked, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_boolean_bitnot<uint64_t>(
      CudaStreams(streams), (boolean_bitnot_buffer<uint64_t> **)mem_ptr, params,
@@ -80,38 +68,28 @@ void cleanup_cuda_boolean_bitnot_64(CudaStreamsFFI streams,
  *mem_ptr_void = nullptr;
 }

-uint64_t scratch_cuda_integer_bitop_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+uint64_t scratch_cuda_integer_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_bitop<uint64_t>(
      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
      lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
 }

-uint64_t scratch_cuda_integer_scalar_bitop_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_bitop<uint64_t>(
      CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
@@ -129,20 +107,18 @@ void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
  cuda_synchronize_stream(cuda_streams.stream(0), cuda_streams.gpu_index(0));
 }

-void cuda_integer_bitop_64_async(CudaStreamsFFI streams,
-                                 CudaRadixCiphertextFFI *lwe_array_out,
-                                 CudaRadixCiphertextFFI const *lwe_array_1,
-                                 CudaRadixCiphertextFFI const *lwe_array_2,
-                                 int8_t *mem_ptr, void *const *bsks,
-                                 void *const *ksks) {
-
-  host_bitop<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
+void cuda_integer_bitop_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
+    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks) {
+  // In-place variant: lwe_array_inout op= lwe_array_2, no aliasing check needed
+  host_bitop<uint64_t>(CudaStreams(streams), lwe_array_inout, lwe_array_inout,
                       lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
                       (uint64_t **)(ksks));
 }

-void cleanup_cuda_integer_bitop_64(CudaStreamsFFI streams,
-                                   int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
+                                           int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
@@ -151,8 +127,8 @@ void cleanup_cuda_integer_bitop_64(CudaStreamsFFI streams,
  *mem_ptr_void = nullptr;
 }

-void cleanup_cuda_integer_scalar_bitop_64(CudaStreamsFFI streams,
-                                          int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_scalar_bitop_inplace_64(CudaStreamsFFI streams,
+                                                  int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
@@ -3,6 +3,8 @@
 void extend_radix_with_trivial_zero_blocks_msb_64(
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    CudaStreamsFFI streams) {
+  PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
+                                  "for out-of-place operations");
  auto cuda_streams = CudaStreams(streams);
  host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(output, input,
                                                           cuda_streams);
@@ -12,6 +14,8 @@ void extend_radix_with_trivial_zero_blocks_msb_64(
 void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
                              CudaRadixCiphertextFFI const *input,
                              CudaStreamsFFI streams) {
+  PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
+                                  "for out-of-place operations");

  auto cuda_streams = CudaStreams(streams);
  host_trim_radix_blocks_lsb<uint64_t>(output, input, cuda_streams);
@@ -21,6 +25,8 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
 void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
                              CudaRadixCiphertextFFI const *input,
                              CudaStreamsFFI streams) {
+  PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
+                                  "for out-of-place operations");

  auto cuda_streams = CudaStreams(streams);
  host_trim_radix_blocks_msb<uint64_t>(output, input, cuda_streams);
@@ -28,19 +34,14 @@ void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
 }

 uint64_t scratch_cuda_cast_to_unsigned_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_input_blocks, uint32_t target_num_blocks, bool input_is_signed,
-    bool requires_full_propagate, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    bool input_is_signed, bool requires_full_propagate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_cast_to_unsigned<uint64_t>(
      CudaStreams(streams), (int_cast_to_unsigned_buffer<uint64_t> **)mem_ptr,
@@ -54,6 +55,8 @@ void cuda_cast_to_unsigned_64_async(CudaStreamsFFI streams,
                                    int8_t *mem_ptr, uint32_t target_num_blocks,
                                    bool input_is_signed, void *const *bsks,
                                    void *const *ksks) {
+  PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
+                                  "for out-of-place operations");

  host_cast_to_unsigned<uint64_t>(
      CudaStreams(streams), output, input,
@@ -72,19 +75,13 @@ void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_cast_to_signed_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_input_blocks,
-    uint32_t target_num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool input_is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_input_blocks, uint32_t target_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool input_is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_cast_to_signed<uint64_t>(
      CudaStreams(streams), (int_cast_to_signed_buffer<uint64_t> **)mem_ptr,
@@ -97,6 +94,8 @@ void cuda_cast_to_signed_64_async(CudaStreamsFFI streams,
                                  CudaRadixCiphertextFFI const *input,
                                  int8_t *mem, bool input_is_signed,
                                  void *const *bsks, void *const *ksks) {
+  PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
+                                  "for out-of-place operations");

  host_cast_to_signed<uint64_t>(CudaStreams(streams), output, input,
                                (int_cast_to_signed_buffer<uint64_t> *)mem,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,18 +1,16 @@
 #include "integer/cmux.cuh"

-uint64_t scratch_cuda_cmux_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+uint64_t scratch_cuda_cmux_64_async(CudaStreamsFFI streams, int8_t **mem_ptr,
+                                    CudaLweBootstrapKeyParamsFFI bsk_params,
+                                    uint32_t ks_level, uint32_t ks_base_log,
+                                    uint32_t lwe_ciphertext_count,
+                                    uint32_t message_modulus,
+                                    uint32_t carry_modulus,
+                                    bool allocate_gpu_memory,
+                                    PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch cmux")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };
@@ -30,6 +28,18 @@ void cuda_cmux_64_async(CudaStreamsFFI streams,
                        CudaRadixCiphertextFFI const *lwe_array_true,
                        CudaRadixCiphertextFFI const *lwe_array_false,
                        int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
+  PANIC_IF_FALSE(
+      lwe_array_out != lwe_condition,
+      "Output and condition pointers must be different for out-of-place "
+      "operations");
+  PANIC_IF_FALSE(
+      lwe_array_out != lwe_array_true,
+      "Output and true-branch pointers must be different for out-of-place "
+      "operations");
+  PANIC_IF_FALSE(
+      lwe_array_out != lwe_array_false,
+      "Output and false-branch pointers must be different for out-of-place "
+      "operations");
  PUSH_RANGE("cmux")
  host_cmux<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_condition,
                      lwe_array_true, lwe_array_false,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,18 +1,14 @@
 #include "integer/comparison.cuh"

 uint64_t scratch_cuda_integer_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, COMPARISON_TYPE op_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch comparison")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  uint64_t size_tracker = 0;
  switch (op_type) {
@@ -38,18 +34,14 @@ uint64_t scratch_cuda_integer_comparison_64_async(
 }

 uint64_t scratch_cuda_integer_scalar_comparison_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, COMPARISON_TYPE op_type, bool is_signed,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch scalar comparison")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  uint64_t size_tracker = 0;
  switch (op_type) {
@@ -80,6 +72,12 @@ void cuda_integer_comparison_64_async(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI const *lwe_array_2,
                                      int8_t *mem_ptr, void *const *bsks,
                                      void *const *ksks) {
+  PANIC_IF_FALSE(lwe_array_out != lwe_array_1,
+                 "Output and first input pointers must be different for "
+                 "out-of-place operations");
+  PANIC_IF_FALSE(lwe_array_out != lwe_array_2,
+                 "Output and second input pointers must be different for "
+                 "out-of-place operations");
  PUSH_RANGE("comparison")
  if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
    PANIC("Cuda error: input num radix blocks must be the same")
@@ -145,18 +143,13 @@ void cleanup_cuda_integer_scalar_comparison_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_comparison_check<uint64_t>(
      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
@@ -167,6 +160,9 @@ void cuda_integer_are_all_comparisons_block_true_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
+  PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -187,18 +183,13 @@ void cleanup_cuda_integer_are_all_comparisons_block_true_64(
 }

 uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_comparison_check<uint64_t>(
      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
@@ -209,6 +200,9 @@ void cuda_integer_is_at_least_one_comparisons_block_true_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
+  PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -23,22 +23,24 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_decompress,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t num_blocks_to_decompress,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  // Decompression doesn't keyswitch, so big and small dimensions are the same
  int_radix_params encryption_params(
-      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log,
-      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
+      (PBS_TYPE)bsk_params.pbs_type, encryption_glwe_dimension,
+      encryption_polynomial_size, bsk_params.big_lwe_dimension,
+      bsk_params.big_lwe_dimension, 0, 0, bsk_params.level_count,
+      bsk_params.base_log, bsk_params.grouping_factor, message_modulus,
+      carry_modulus, noise_reduction_type);

  int_radix_params compression_params(
-      pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
-      0, 0, pbs_level, pbs_base_log, grouping_factor, message_modulus,
-      carry_modulus, noise_reduction_type);
+      (PBS_TYPE)bsk_params.pbs_type, compression_glwe_dimension,
+      compression_polynomial_size, bsk_params.big_lwe_dimension,
+      compression_glwe_dimension * compression_polynomial_size, 0, 0,
+      bsk_params.level_count, bsk_params.base_log, bsk_params.grouping_factor,
+      message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
      CudaStreams(streams), (int_decompression<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -2,17 +2,13 @@

 uint64_t scratch_cuda_integer_div_rem_64_async(
    CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch div")
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_div_rem<uint64_t>(
      CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
@@ -27,6 +23,21 @@ void cuda_integer_div_rem_64_async(CudaStreamsFFI streams,
                                   CudaRadixCiphertextFFI const *divisor,
                                   bool is_signed, int8_t *mem_ptr,
                                   void *const *bsks, void *const *ksks) {
+  PANIC_IF_FALSE(quotient != numerator,
+                 "Quotient and numerator pointers must be different for "
+                 "out-of-place operations");
+  PANIC_IF_FALSE(quotient != divisor,
+                 "Quotient and divisor pointers must be different for "
+                 "out-of-place operations");
+  PANIC_IF_FALSE(remainder != numerator,
+                 "Remainder and numerator pointers must be different for "
+                 "out-of-place operations");
+  PANIC_IF_FALSE(remainder != divisor,
+                 "Remainder and divisor pointers must be different for "
+                 "out-of-place operations");
+  PANIC_IF_FALSE(quotient != remainder,
+                 "Quotient and remainder pointers must be different for "
+                 "out-of-place operations");
  PUSH_RANGE("div")
  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

--- a/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
@@ -1,19 +1,14 @@
 #include "ilog2.cuh"

 uint64_t scratch_cuda_integer_count_of_consecutive_bits_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t counter_num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    Direction direction, BitValue bit_value, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t counter_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, Direction direction,
+    BitValue bit_value, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_count_of_consecutive_bits<uint64_t>(
      CudaStreams(streams), params,
@@ -30,6 +25,9 @@ void cuda_integer_count_of_consecutive_bits_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
    CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks) {
+  PANIC_IF_FALSE(output_ct != input_ct,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  host_integer_count_of_consecutive_bits<uint64_t, uint64_t>(
      CudaStreams(streams), output_ct, input_ct,
@@ -50,19 +48,14 @@ void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_ilog2_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t input_num_blocks, uint32_t counter_num_blocks,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t input_num_blocks, uint32_t counter_num_blocks,
    uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_ilog2<uint64_t>(
      CudaStreams(streams), params, (int_ilog2_buffer<uint64_t> **)mem_ptr,
@@ -81,6 +74,9 @@ void cuda_integer_ilog2_64_async(
    CudaRadixCiphertextFFI const *trivial_ct_2,
    CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks) {
+  PANIC_IF_FALSE(output_ct != input_ct,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  host_integer_ilog2<uint64_t, uint64_t>(
      CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -15,17 +15,12 @@ void cuda_full_propagation_64_inplace_async(
 }

 uint64_t scratch_cuda_full_propagation_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_full_propagation<uint64_t>(
      CudaStreams(streams), (int_fullprop_buffer<uint64_t> **)mem_ptr, params,
@@ -44,17 +39,13 @@ void cleanup_cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
@@ -62,17 +53,13 @@ uint64_t scratch_cuda_propagate_single_carry_64_inplace_async(
 }

 uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t requested_flag, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
@@ -80,17 +67,13 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace_async(
 }

 uint64_t scratch_cuda_integer_overflowing_sub_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t compute_overflow, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_overflowing_sub<uint64_t>(
      CudaStreams(streams), (int_borrow_prop_memory<uint64_t> **)mem_ptr,
@@ -170,17 +153,12 @@ void cleanup_cuda_integer_overflowing_sub_64_inplace(CudaStreamsFFI streams,

 uint64_t scratch_cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_apply_univariate_lut<uint64_t>(
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
@@ -190,17 +168,12 @@ uint64_t scratch_cuda_apply_univariate_lut_64_async(

 uint64_t scratch_cuda_apply_many_univariate_lut_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_radix_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_many_lut, uint64_t lut_degree,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_apply_many_univariate_lut<uint64_t>(
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
@@ -212,6 +185,9 @@ void cuda_apply_univariate_lut_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks, void *const *bsks) {
+  PANIC_IF_FALSE(output_radix_lwe != input_radix_lwe,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  host_apply_univariate_lut<uint64_t>(
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
@@ -243,6 +219,9 @@ void cuda_apply_many_univariate_lut_64_async(
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks, void *const *bsks, uint32_t num_many_lut,
    uint32_t lut_stride) {
+  PANIC_IF_FALSE(output_radix_lwe != input_radix_lwe,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  host_apply_many_univariate_lut<uint64_t>(
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
@@ -288,19 +267,14 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
 }

 uint64_t scratch_cuda_apply_noise_squashing_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t original_num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t input_glwe_dimension,
+    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_radix_blocks, uint32_t original_num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_apply_noise_squashing_mem(
      streams, params, (int_noise_squashing_lut<uint64_t> **)mem_ptr,
@@ -312,6 +286,9 @@ void cuda_apply_noise_squashing_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks, void *const *bsks) {
+  PANIC_IF_FALSE(output_radix_lwe != input_radix_lwe,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  PUSH_RANGE("apply noise squashing")
  integer_radix_apply_noise_squashing<uint64_t>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -61,23 +61,73 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,

  total_count = message_count + carry_count;
 }
-/*
- * This scratch function allocates the necessary amount of data on the GPU for
- * the integer radix multiplication in keyswitch->bootstrap order.
- */
-uint64_t scratch_cuda_integer_mult_64_async(
+void cuda_integer_mult_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_inout,
+    bool const is_bool_left, CudaRadixCiphertextFFI const *radix_lwe_right,
+    bool const is_bool_right, void *const *bsks, void *const *ksks,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
+  // In-place variant: radix_lwe_inout *= radix_lwe_right, no aliasing check
+  // needed
+  PUSH_RANGE("mul_inplace")
+  switch (polynomial_size) {
+  case 256:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 512:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 1024:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 2048:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 4096:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 8192:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 16384:
+    host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
+        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
+        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  default:
+    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
+          "Supported N's are powers of two in the interval [256..16384].")
+  }
+  POP_RANGE()
+}
+
+uint64_t scratch_cuda_integer_mult_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
    bool const is_boolean_right, uint32_t message_modulus,
-    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          polynomial_size * glwe_dimension, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    uint32_t carry_modulus, CudaLweBootstrapKeyParamsFFI bsk_params,
+    uint32_t ks_base_log, uint32_t ks_level, uint32_t num_radix_blocks,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  const uint32_t polynomial_size = bsk_params.polynomial_size;
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  switch (polynomial_size) {
  case 256:
@@ -97,94 +147,8 @@ uint64_t scratch_cuda_integer_mult_64_async(
  }
 }

-/*
- * Computes a multiplication between two 64 bit radix lwe ciphertexts
- * encrypting integer values. keyswitch -> bootstrap pattern is used, function
- * works for single pair of radix ciphertexts, 'v_stream' can be used for
- * parallelization
- * - 'v_stream' is a void pointer to the Cuda stream to be used in the kernel
- * launch
- * - 'gpu_index' is the index of the GPU to be used in the kernel launch
- * - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
- * multiplication
- * - 'radix_lwe_left' left radix big lwe ciphertext
- * - 'radix_lwe_right' right radix big lwe ciphertext
- * - 'bsk' bootstrapping key in fourier domain
- * - 'ksk' keyswitching key
- * - 'mem_ptr'
- * - 'message_modulus' message_modulus
- * - 'carry_modulus' carry_modulus
- * - 'glwe_dimension' glwe_dimension
- * - 'lwe_dimension' is the dimension of small lwe ciphertext
- * - 'polynomial_size' polynomial size
- * - 'pbs_base_log' base log used in the pbs
- * - 'pbs_level' decomposition level count used in the pbs
- * - 'ks_level' decomposition level count used in the keyswitch
- * - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
- * ciphertext
- * - 'pbs_type' selects which PBS implementation should be used
- */
-void cuda_integer_mult_64_async(CudaStreamsFFI streams,
-                                CudaRadixCiphertextFFI *radix_lwe_out,
-                                CudaRadixCiphertextFFI const *radix_lwe_left,
-                                bool const is_bool_left,
-                                CudaRadixCiphertextFFI const *radix_lwe_right,
-                                bool const is_bool_right, void *const *bsks,
-                                void *const *ksks, int8_t *mem_ptr,
-                                uint32_t polynomial_size, uint32_t num_blocks) {
-  PUSH_RANGE("mul")
-  switch (polynomial_size) {
-  case 256:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  case 512:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  case 1024:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  case 2048:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  case 4096:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  case 8192:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  case 16384:
-    host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
-        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
-        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
-    break;
-  default:
-    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
-          "Supported N's are powers of two in the interval [256..16384].")
-  }
-  POP_RANGE()
-}
-
-void cleanup_cuda_integer_mult_64(CudaStreamsFFI streams,
-                                  int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_mult_inplace_64(CudaStreamsFFI streams,
+                                          int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup mul")
  int_mul_memory<uint64_t> *mem_ptr =
      (int_mul_memory<uint64_t> *)(*mem_ptr_void);
@@ -196,20 +160,14 @@ void cleanup_cuda_integer_mult_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_in_radix,
    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    uint32_t carry_modulus, bool reduce_degrees_for_single_carry_propagation,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);
  return scratch_cuda_integer_partial_sum_ciphertexts_vec<uint64_t>(
      CudaStreams(streams),
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
@@ -221,6 +179,9 @@ void cuda_partial_sum_ciphertexts_vec_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks) {
+  PANIC_IF_FALSE(radix_lwe_out != radix_lwe_vec,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
  if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -5,6 +5,9 @@ void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
                               CudaRadixCiphertextFFI const *lwe_array_in,
                               uint32_t message_modulus, uint32_t carry_modulus,
                               uint32_t num_radix_blocks) {
+  PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  auto cuda_streams = CudaStreams(streams);
  host_negation<uint64_t>(cuda_streams, lwe_array_out, lwe_array_in,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cu
@@ -1,19 +1,14 @@
 #include "integer/oprf.cuh"

 uint64_t scratch_cuda_integer_grouped_oprf_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_to_process,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_to_process,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t total_random_bits,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_grouped_oprf<uint64_t>(
      CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
@@ -45,20 +40,14 @@ void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_grouped_oprf_custom_range_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_intermediate,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t num_input_random_bits, uint32_t num_scalar_bits,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks_intermediate,
+    uint32_t message_modulus, uint32_t carry_modulus, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t num_input_random_bits,
+    uint32_t num_scalar_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_grouped_oprf_custom_range<uint64_t>(
      CudaStreams(streams),
@@ -72,13 +61,13 @@ void cuda_integer_grouped_oprf_custom_range_64_async(
    uint32_t num_blocks_intermediate, const void *seeded_lwe_input,
    const uint64_t *decomposed_scalar, const uint64_t *has_at_least_one_set,
    uint32_t num_scalars, uint32_t shift, int8_t *mem, void *const *bsks,
-    void *const *ksks) {
+    void *const *compute_bsks, void *const *ksks) {

  host_integer_grouped_oprf_custom_range<uint64_t>(
      CudaStreams(streams), radix_lwe_out, num_blocks_intermediate,
      (const uint64_t *)seeded_lwe_input, decomposed_scalar,
      has_at_least_one_set, num_scalars, shift,
-      (int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks,
+      (int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks, compute_bsks,
      (uint64_t *const *)ksks);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -114,7 +114,7 @@ void host_integer_grouped_oprf_custom_range(
    const Torus *decomposed_scalar, const Torus *has_at_least_one_set,
    uint32_t num_scalars, uint32_t shift,
    int_grouped_oprf_custom_range_memory<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks) {
+    void *const *compute_bsks, Torus *const *ksks) {

  CudaRadixCiphertextFFI *computation_buffer = mem_ptr->tmp_oprf_output;
  set_zero_radix_ciphertext_slice_async<Torus>(
@@ -127,12 +127,12 @@ void host_integer_grouped_oprf_custom_range(

  host_integer_scalar_mul_radix<Torus>(
      streams, computation_buffer, decomposed_scalar, has_at_least_one_set,
-      mem_ptr->scalar_mul_buffer, bsks, ksks, mem_ptr->params.message_modulus,
-      num_scalars);
+      mem_ptr->scalar_mul_buffer, compute_bsks, ksks,
+      mem_ptr->params.message_modulus, num_scalars);

-  host_logical_scalar_shift_inplace<Torus>(streams, computation_buffer, shift,
-                                           mem_ptr->logical_scalar_shift_buffer,
-                                           bsks, ksks, num_blocks_intermediate);
+  host_logical_scalar_shift_inplace<Torus>(
+      streams, computation_buffer, shift, mem_ptr->logical_scalar_shift_buffer,
+      compute_bsks, ksks, num_blocks_intermediate);

  uint32_t num_blocks_output = radix_lwe_out->num_radix_blocks;
  uint32_t blocks_to_copy =
--- a/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cu
@@ -5,7 +5,7 @@ uint64_t scratch_cuda_rerand_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, bool allocate_gpu_memory) {
+    uint32_t carry_modulus, bool allocate_gpu_memory, RERAND_MODE rerand_type) {
  PUSH_RANGE("scratch rerand")
  int_radix_params params(PBS_TYPE::CLASSICAL, 0, 0, big_lwe_dimension,
                          small_lwe_dimension, ks_level, ks_base_log, 0, 0, 0,
@@ -13,8 +13,9 @@ uint64_t scratch_cuda_rerand_64_async(
                          PBS_MS_REDUCTION_T::NO_REDUCTION);

  uint64_t ret = scratch_cuda_rerand<uint64_t>(
-      CudaStreams(streams), (int_rerand_mem<uint64_t> **)mem_ptr,
-      lwe_ciphertext_count, params, allocate_gpu_memory);
+      CudaStreams(streams),
+      reinterpret_cast<int_rerand_mem<uint64_t> **>(mem_ptr),
+      lwe_ciphertext_count, params, allocate_gpu_memory, rerand_type);
  POP_RANGE()
  return ret;
 }
@@ -28,7 +29,7 @@ void cuda_rerand_64_async(
    CudaStreamsFFI streams, void *lwe_array,
    const void *lwe_flattened_encryptions_of_zero_compact_array_in,
    int8_t *mem_ptr, void *const *ksk) {
-
+  PUSH_RANGE("rerand")
  auto rerand_buffer = reinterpret_cast<int_rerand_mem<uint64_t> *>(mem_ptr);

  switch (rerand_buffer->params.big_lwe_dimension) {
@@ -37,49 +38,49 @@ void cuda_rerand_64_async(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  case 512:
    host_rerand_inplace<uint64_t, AmortizedDegree<512>>(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  case 1024:
    host_rerand_inplace<uint64_t, AmortizedDegree<1024>>(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  case 2048:
    host_rerand_inplace<uint64_t, AmortizedDegree<2048>>(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  case 4096:
    host_rerand_inplace<uint64_t, AmortizedDegree<4096>>(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  case 8192:
    host_rerand_inplace<uint64_t, AmortizedDegree<8192>>(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  case 16384:
    host_rerand_inplace<uint64_t, AmortizedDegree<16384>>(
        streams, static_cast<uint64_t *>(lwe_array),
        static_cast<const uint64_t *>(
            lwe_flattened_encryptions_of_zero_compact_array_in),
-        (uint64_t **)(ksk), rerand_buffer);
+        reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
    break;
  default:
    PANIC("CUDA error: lwe_dimension not supported."
@@ -87,12 +88,12 @@ void cuda_rerand_64_async(
          " in the interval [256..16384].");
    break;
  }
+  POP_RANGE()
 }

 void cleanup_cuda_rerand_64(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup rerand")
-  int_rerand_mem<uint64_t> *mem_ptr =
-      (int_rerand_mem<uint64_t> *)(*mem_ptr_void);
+  auto *mem_ptr = reinterpret_cast<int_rerand_mem<uint64_t> *>(*mem_ptr_void);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
@@ -14,26 +14,25 @@ void host_rerand_inplace(
    CudaStreams const streams, Torus *lwe_array,
    const Torus *lwe_flattened_encryptions_of_zero_compact_array_in,
    Torus *const *ksk, int_rerand_mem<Torus> *mem_ptr) {
-  auto zero_lwes = mem_ptr->tmp_zero_lwes;
+  auto rerand_mode = mem_ptr->rerand_mode;
+  auto expanded_zero_lwes = mem_ptr->tmp_expanded_zero_lwes;
  auto num_lwes = mem_ptr->num_lwes;
-  auto ksed_zero_lwes = mem_ptr->tmp_ksed_zero_lwes;
-  auto lwe_trivial_indexes = mem_ptr->lwe_trivial_indexes;
-  auto ksk_params = mem_ptr->params;
-  auto output_dimension = ksk_params.small_lwe_dimension;
-  auto input_dimension = ksk_params.big_lwe_dimension;
-  auto ks_level = ksk_params.ks_level;
-  auto ks_base_log = ksk_params.ks_base_log;
-  auto message_modulus = ksk_params.message_modulus;
-  auto carry_modulus = ksk_params.carry_modulus;

-  GPU_ASSERT(sizeof(Torus) == 8,
-             "Cuda error: expand is only supported on 64 bits");
+  auto rerand_params = mem_ptr->params;
+  auto message_modulus = rerand_params.message_modulus;
+  auto carry_modulus = rerand_params.carry_modulus;
+  auto input_dimension = rerand_params.big_lwe_dimension;
+  // Default to input dimension; overridden to small_lwe_dimension in the KS
+  // path
+  auto output_dimension = input_dimension;
+
+  static_assert(sizeof(Torus) == 8, "expand is only supported on 64 bits");

  // Expand encryptions of zero
  // Wraps the input into a flattened_compact_lwe_lists type
  auto compact_lwe_lists = flattened_compact_lwe_lists<Torus>(
      const_cast<Torus *>(lwe_flattened_encryptions_of_zero_compact_array_in),
-      &num_lwes, (uint32_t)1, input_dimension);
+      &num_lwes, static_cast<uint32_t>(1), input_dimension);
  auto h_expand_jobs = mem_ptr->h_expand_jobs;
  auto d_expand_jobs = mem_ptr->d_expand_jobs;

@@ -53,20 +52,30 @@ void host_rerand_inplace(
      streams.stream(0), streams.gpu_index(0), true);

  host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
-                                 zero_lwes, d_expand_jobs, num_lwes);
+                                 expanded_zero_lwes, d_expand_jobs, num_lwes);

-  // Keyswitch
-  execute_keyswitch_async<Torus>(
-      streams.get_ith(0), ksed_zero_lwes, lwe_trivial_indexes, zero_lwes,
-      lwe_trivial_indexes, ksk, input_dimension, output_dimension, ks_base_log,
-      ks_level, num_lwes, true, mem_ptr->ks_tmp_buf_vec);
+  auto lwes_to_be_added = expanded_zero_lwes;
+  if (rerand_mode == RERAND_MODE::RERAND_WITH_KS) {
+    lwes_to_be_added = mem_ptr->tmp_ksed_expanded_zero_lwes;
+    output_dimension = rerand_params.small_lwe_dimension;
+    auto ks_level = rerand_params.ks_level;
+    auto ks_base_log = rerand_params.ks_base_log;
+    auto lwe_trivial_indexes = mem_ptr->lwe_trivial_indexes;
+
+    // Keyswitch
+    execute_keyswitch_async<Torus>(streams.get_ith(0), lwes_to_be_added,
+                                   lwe_trivial_indexes, expanded_zero_lwes,
+                                   lwe_trivial_indexes, ksk, input_dimension,
+                                   output_dimension, ks_base_log, ks_level,
+                                   num_lwes, true, mem_ptr->ks_tmp_buf_vec);
+  }

  // Add ks output to ct
  // Check sizes
  CudaRadixCiphertextFFI lwes_ffi;
  into_radix_ciphertext(&lwes_ffi, lwe_array, num_lwes, output_dimension);
  CudaRadixCiphertextFFI ksed_zero_lwes_ffi;
-  into_radix_ciphertext(&ksed_zero_lwes_ffi, ksed_zero_lwes, num_lwes,
+  into_radix_ciphertext(&ksed_zero_lwes_ffi, lwes_to_be_added, num_lwes,
                        output_dimension);
  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &lwes_ffi,
                       &lwes_ffi, &ksed_zero_lwes_ffi, num_lwes,
@@ -81,10 +90,11 @@ __host__ uint64_t scratch_cuda_rerand(CudaStreams streams,
                                      int_rerand_mem<Torus> **mem_ptr,
                                      uint32_t num_lwes,
                                      int_radix_params params,
-                                      bool allocate_gpu_memory) {
+                                      bool allocate_gpu_memory,
+                                      RERAND_MODE rerand_mode) {

  uint64_t size_tracker = 0;
-  *mem_ptr = new int_rerand_mem<Torus>(streams, params, num_lwes,
+  *mem_ptr = new int_rerand_mem<Torus>(streams, params, num_lwes, rerand_mode,
                                       allocate_gpu_memory, size_tracker);
  return size_tracker;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -1,13 +1,13 @@
 #include "integer/scalar_bitops.cuh"

-void cuda_integer_scalar_bitop_64_async(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
-    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
-
+void cuda_integer_scalar_bitop_inplace_64_async(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
+    void const *clear_blocks, void const *h_clear_blocks,
+    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks) {
+  // In-place variant: lwe_array_inout op= scalar, no aliasing check needed
  host_scalar_bitop<uint64_t>(
-      CudaStreams(streams), lwe_array_out, lwe_array_input,
+      CudaStreams(streams), lwe_array_inout, lwe_array_inout,
      static_cast<const uint64_t *>(clear_blocks),
      static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -36,6 +36,9 @@ void cuda_integer_scalar_comparison_64_async(
    CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
    void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks, uint32_t num_scalar_blocks) {
+  PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
+                 "Output and input pointers must be different for out-of-place "
+                 "operations");

  // The output ciphertext might be a boolean block or a radix ciphertext
  // depending on the case (eq/gt vs max/min) so the amount of blocks to
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
@@ -1,19 +1,13 @@
 #include "scalar_div.cuh"

 uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -45,19 +39,13 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_signed_scalar_div_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_signed_scalar_div_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -89,20 +77,14 @@ void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
 }

 uint64_t scratch_cuda_integer_unsigned_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -118,6 +100,9 @@ void cuda_integer_unsigned_scalar_div_rem_radix_64_async(
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    void const *clear_blocks, void const *h_clear_blocks,
    uint32_t num_clear_blocks) {
+  PANIC_IF_FALSE(quotient_ct != remainder_ct,
+                 "Quotient and remainder pointers must be different for "
+                 "out-of-place operations");

  host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), quotient_ct, remainder_ct,
@@ -140,20 +125,14 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
 }

 uint64_t scratch_cuda_integer_signed_scalar_div_rem_radix_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), params,
@@ -168,6 +147,9 @@ void cuda_integer_signed_scalar_div_rem_radix_64_async(
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    uint32_t numerator_bits) {
+  PANIC_IF_FALSE(quotient_ct != remainder_ct,
+                 "Quotient and remainder pointers must be different for "
+                 "out-of-place operations");

  host_integer_signed_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), quotient_ct, remainder_ct,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -1,18 +1,13 @@
 #include "integer/scalar_mul.cuh"

 uint64_t scratch_cuda_integer_scalar_mul_64_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t num_scalar_bits, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_scalar_mul<uint64_t>(
      CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -1,18 +1,13 @@
 #include "scalar_rotate.cuh"

 uint64_t scratch_cuda_scalar_rotate_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_scalar_rotate<uint64_t>(
      CudaStreams(streams),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -1,18 +1,13 @@
 #include "scalar_shifts.cuh"

 uint64_t scratch_cuda_logical_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_logical_scalar_shift<uint64_t>(
      CudaStreams(streams),
@@ -35,18 +30,13 @@ void cuda_logical_scalar_shift_64_inplace_async(
 }

 uint64_t scratch_cuda_arithmetic_scalar_shift_64_inplace_async(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    CudaStreamsFFI streams, int8_t **mem_ptr,
+    CudaLweBootstrapKeyParamsFFI bsk_params, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus, noise_reduction_type);
+  int_radix_params params(bsk_params, ks_level, ks_base_log, message_modulus,
+                          carry_modulus, noise_reduction_type);

  return scratch_cuda_arithmetic_scalar_shift<uint64_t>(
      CudaStreams(streams),
--- a/Show More
+++ b/Show More