Compare commits

...

16 Commits

Author SHA1 Message Date
Baptiste Roux
52b8e81ccb fix(hpu): Correctly select adder configuration in ERC_20/ERC_20_SIMD
Add knobs to select a ripple-carry or Kogge-Stone adder in ERC_20/ERC_20_SIMD.
Previously, the adder was hardcoded to ripple-carry, which degraded the
latency of ERC_20.
2025-12-24 10:38:38 +01:00
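Context for the commit above: the sketch below contrasts the two adder topologies the new knob selects between, on plain machine integers. It is illustrative only — the function names are invented here and this is not the HPU/FHE implementation — but it shows why the choice matters: a ripple-carry adder resolves carries serially (O(n) depth), while a Kogge-Stone adder resolves them with a parallel prefix (O(log n) depth), and depth translates directly into latency.

```cpp
#include <cstdint>
#include <cstdio>

// Ripple-carry: each carry depends on the previous one, so the loop
// models a chain of 32 dependent steps.
uint32_t ripple_add(uint32_t a, uint32_t b) {
  uint32_t sum = 0, carry = 0;
  for (int i = 0; i < 32; ++i) {
    uint32_t ai = (a >> i) & 1, bi = (b >> i) & 1;
    sum |= (ai ^ bi ^ carry) << i;
    carry = (ai & bi) | (carry & (ai ^ bi));
  }
  return sum;
}

// Kogge-Stone: generate/propagate signals are combined in a log-depth
// parallel prefix, so all 32 carries are known after 5 combining steps.
uint32_t kogge_stone_add(uint32_t a, uint32_t b) {
  uint32_t g = a & b; // generate
  uint32_t p = a ^ b; // propagate
  for (int d = 1; d < 32; d <<= 1) {
    g |= p & (g << d);
    p &= p << d;
  }
  return (a ^ b) ^ (g << 1); // sum = propagate XOR incoming carries
}

int main() {
  printf("%u %u\n", ripple_add(123456u, 654321u),
         kogge_stone_add(123456u, 654321u)); // both print 777777
}
```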
Baptiste Roux
b19a7773bb feat: Add IfThenZero impl for Cpu 2025-12-24 10:38:38 +01:00
pgardratzama
0342b0466d chore(hpu): fix panic msg 2025-12-24 10:38:38 +01:00
pgardratzama
edc9ef0026 fix(hpu): fix whitepaper erc20 for HPU using if_then_zero 2025-12-24 10:38:38 +01:00
Guillermo Oyarzun
92df46f8f2 fix(gpu): return to 64 regs in multi-bit pbs 2025-12-23 11:51:00 +01:00
David Testé
effb7ada6d chore(ci): fix argument name passed to data_extractor 2025-12-18 18:09:34 +01:00
Agnes Leroy
49be544297 fix(gpu): fix cpu memory leak in expand and rerand 2025-12-18 16:33:23 +01:00
David Testé
23600eb8e1 chore(ci): split gpu documentation benchmarks execution
This is done to mitigate H100x8-SXM5 server scarcity.
2025-12-18 14:56:15 +01:00
Agnes Leroy
9708cc7fe9 chore(gpu): remove core crypto from valgrind run 2025-12-18 13:01:12 +01:00
dependabot[bot]
4cdfccb659 chore(deps): bump actions/setup-node from 6.0.0 to 6.1.0
Bumps [actions/setup-node](https://github.com/actions/setup-node) from 6.0.0 to 6.1.0.
- [Release notes](https://github.com/actions/setup-node/releases)
- [Commits](2028fbc5c2...395ad32622)

---
updated-dependencies:
- dependency-name: actions/setup-node
  dependency-version: 6.1.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-18 11:05:56 +01:00
dependabot[bot]
031c3fe34f chore(deps): bump actions/checkout from 6.0.0 to 6.0.1
Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.0 to 6.0.1.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](1af3b93b68...8e8c483db8)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 6.0.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-18 11:05:47 +01:00
dependabot[bot]
ea99307cf5 chore(deps): bump actions/stale from 10.1.0 to 10.1.1
Bumps [actions/stale](https://github.com/actions/stale) from 10.1.0 to 10.1.1.
- [Release notes](https://github.com/actions/stale/releases)
- [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md)
- [Commits](5f858e3efb...997185467f)

---
updated-dependencies:
- dependency-name: actions/stale
  dependency-version: 10.1.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-18 11:05:36 +01:00
Enzo Di Maria
ca2a79f1fb refactor(gpu): Threshold for multi-GPU with Classical PBS 2025-12-18 09:27:09 +01:00
Enzo Di Maria
0a59e86675 fix(gpu): Using tbc for classical 64 bits pbs on H100 2025-12-17 19:18:01 +01:00
Nicolas Sarlin
312ce494bf chore(zk): add 1 * 64 benches with production CRS 2025-12-17 15:06:37 +01:00
Nicolas Sarlin
5f2e7e31f1 chore(zk): align wasm bench and integer bench 2025-12-17 15:06:37 +01:00
95 changed files with 474 additions and 314 deletions

View File

@@ -66,7 +66,7 @@ jobs:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'true' # Needed to pull lfs data
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -63,7 +63,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -171,7 +171,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -60,7 +60,7 @@ jobs:
timeout-minutes: 1440
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -51,7 +51,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -112,7 +112,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -72,7 +72,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -182,7 +182,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -64,7 +64,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -149,7 +149,7 @@ jobs:
params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -8,8 +8,13 @@ on:
description: "Run CPU benchmarks"
type: boolean
default: true
- run-gpu-benchmarks:
- description: "Run GPU benchmarks"
+ # GPU benchmarks are split because of resource scarcity.
+ run-gpu-integer-benchmarks:
+ description: "Run GPU integer benchmarks"
type: boolean
default: true
+ run-gpu-core-crypto-benchmarks:
+ description: "Run GPU core-crypto benchmarks"
+ type: boolean
+ default: true
run-hpu-benchmarks:
@@ -52,7 +57,7 @@ jobs:
run-benchmarks-gpu-integer:
name: benchmark_documentation/run-benchmarks-gpu-integer
uses: ./.github/workflows/benchmark_gpu_common.yml
- if: inputs.run-gpu-benchmarks
+ if: inputs.run-gpu-integer-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
@@ -113,7 +118,7 @@ jobs:
run-benchmarks-gpu-core-crypto:
name: benchmark_documentation/run-benchmarks-gpu-core-crypto
uses: ./.github/workflows/benchmark_gpu_common.yml
- if: inputs.run-gpu-benchmarks
+ if: inputs.run-gpu-core-crypto-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
@@ -133,7 +138,7 @@ jobs:
generate-svgs-with-benchmarks-run:
name: benchmark-documentation/generate-svgs-with-benchmarks-run
if: ${{ always() &&
- (inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks ||inputs.run-hpu-benchmarks) &&
+ (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
needs: [
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
@@ -143,7 +148,7 @@ jobs:
with:
time_span_days: 5
generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
- generate-gpu-svgs: ${{ inputs.run-gpu-benchmarks }}
+ generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -152,7 +157,7 @@ jobs:
generate-svgs-without-benchmarks-run:
name: benchmark-documentation/generate-svgs-without-benchmarks-run
- if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks || inputs.run-hpu-benchmarks) &&
+ if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
uses: ./.github/workflows/generate_svgs.yml
with:
@@ -175,7 +180,7 @@ jobs:
PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'

View File

@@ -40,7 +40,7 @@ jobs:
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -175,7 +175,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -130,7 +130,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
path: tfhe-rs
persist-credentials: false
@@ -141,7 +141,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -223,7 +223,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Use Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
with:
node-version: 20.x
@@ -299,7 +299,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -126,7 +126,7 @@ jobs:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -50,7 +50,7 @@ jobs:
pull-requests: write # Needed to write a comment in a pull-request
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0 # Needed to get commit hash
persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -305,7 +305,7 @@ jobs:
REF_NAME: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -55,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -102,7 +102,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -55,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -102,7 +102,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -39,7 +39,7 @@ jobs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -91,7 +91,7 @@ jobs:
browser: [ chrome, firefox ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -175,7 +175,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -26,7 +26,7 @@ jobs:
name: cargo_audit/audit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
outputs:
matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -140,7 +140,7 @@ jobs:
result: ${{ steps.set_builds_result.outputs.result }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -56,7 +56,7 @@ jobs:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +92,7 @@ jobs:
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -31,7 +31,7 @@ jobs:
ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: "false"
@@ -87,7 +87,7 @@ jobs:
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
timeout-minutes: 5760 # 4 days
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -62,7 +62,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
python3 -m pip install -r ci/data_extractor/requirements.txt
python3 ci/data_extractor/src/data_extractor.py "${OUTPUT_FILENAME}" \
--generate-svg \
- --backend-comparison\
+ --backends-comparison \
--time-span-days "${TIME_SPAN}"
env:
OUTPUT_FILENAME: ${{ inputs.output_filename }}

View File

@@ -41,7 +41,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -23,8 +23,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
- # every 3 months
- - cron: "0 0 1 */3 *"
+ # every month
+ - cron: "0 0 1 * *"
permissions:
contents: read
@@ -50,7 +50,7 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
- profile: gpu-test
+ profile: single-h100
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
@@ -79,7 +79,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -114,7 +114,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -65,7 +65,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -32,7 +32,7 @@ jobs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -83,7 +83,7 @@ jobs:
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -53,7 +53,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -52,7 +52,7 @@ jobs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -93,7 +93,7 @@ jobs:
id-token: write # Needed for OIDC token exchange on crates.io
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'

View File

@@ -64,7 +64,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: "false"

View File

@@ -68,7 +68,7 @@ jobs:
id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -85,7 +85,7 @@ jobs:
make build_web_js_api_parallel
- name: Authenticate on NPM
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
with:
node-version: '24'
registry-url: 'https://registry.npmjs.org'

View File

@@ -60,7 +60,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
toolchain: stable
- name: Checkout lattice-estimator
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -17,7 +17,7 @@ jobs:
issues: read # Needed to fetch all issues
pull-requests: write # Needed to write message and close the PR
steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
days-before-stale: 2

View File

@@ -35,7 +35,8 @@ template <typename Torus> struct int_aes_lut_buffers {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
- SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
+ SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
+ params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
@@ -50,8 +51,8 @@ template <typename Torus> struct int_aes_lut_buffers {
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
- auto active_streams_flush_lut =
- streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
+ auto active_streams_flush_lut = streams.active_gpu_subset(
+ AES_STATE_BITS * num_aes_inputs, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
@@ -65,7 +66,8 @@ template <typename Torus> struct int_aes_lut_buffers {
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
- auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
+ auto active_streams_carry_lut =
+ streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -8,7 +8,8 @@
extern std::mutex m;
extern bool p2p_enabled;
- extern const int THRESHOLD_MULTI_GPU;
+ extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
+ extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
extern "C" {
int32_t cuda_setup_multi_gpu(int device_0_id);
@@ -39,7 +40,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}
- uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
+ uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
+ PBS_TYPE pbs_type);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
@@ -73,9 +75,10 @@ public:
// Returns a subset of this set as an active subset. An active subset is one
// that is temporarily used to perform some computation
- CudaStreams active_gpu_subset(int num_radix_blocks) {
- return CudaStreams(_streams, _gpu_indexes,
- get_active_gpu_count(num_radix_blocks, _gpu_count));
+ CudaStreams active_gpu_subset(int num_radix_blocks, PBS_TYPE pbs_type) {
+ return CudaStreams(
+ _streams, _gpu_indexes,
+ get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
}
// Returns a CudaStreams struct containing only the ith stream
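The definitions behind this header change live in the .cu sources, which this diff does not show. As a rough sketch of what the new parameter enables — the fan-out policy and the threshold values below are assumptions, and the enum/constants are stand-ins for the repo's declarations:

```cpp
#include <algorithm>
#include <cstdint>

// Stand-ins for the repo's declarations; the numeric values are invented.
enum class PBS_TYPE { CLASSICAL, MULTI_BIT };
constexpr uint32_t THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 1024;
constexpr uint32_t THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 2048;

// Sketch of the extended signature: the multi-GPU threshold now depends
// on the PBS flavor instead of a single THRESHOLD_MULTI_GPU constant.
// The ceil-divide fan-out policy below is a guess, not the repo's code.
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type) {
  uint32_t threshold = (pbs_type == PBS_TYPE::MULTI_BIT)
                           ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
                           : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  uint32_t wanted = (num_inputs + threshold - 1) / threshold;
  return std::max(1u, std::min(wanted, gpu_count));
}

int main() {
  // 4096 inputs with the (assumed) classical threshold of 2048: 2 GPUs.
  return get_active_gpu_count(4096, 8, PBS_TYPE::CLASSICAL) == 2 ? 0 : 1;
}
```

Presumably the two PBS flavors saturate a GPU at different batch sizes, which would explain why the classical and multi-bit paths get separate thresholds.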

View File

@@ -20,7 +20,8 @@ template <typename Torus> struct boolean_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
- auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+ auto active_streams =
+ streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
this->unchecked = is_unchecked;
switch (op) {
case BITAND:
@@ -119,7 +120,8 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
switch (op) {
case BITAND:
case BITOR:
@@ -216,7 +218,8 @@ template <typename Torus> struct boolean_bitnot_buffer {
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+ auto active_streams =
+ streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams);
}
}

View File

@@ -39,7 +39,8 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
},
allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
this->last_block = new CudaRadixCiphertextFFI;

View File

@@ -14,7 +14,8 @@ template <typename Torus> struct int_zero_out_if_buffer {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -114,9 +115,11 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams_pred = streams.active_gpu_subset(2 * num_radix_blocks);
+ auto active_streams_pred =
+ streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
predicate_lut->broadcast_lut(active_streams_pred);
- auto active_streams_msg = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams_msg =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams_msg);
}

View File

@@ -52,7 +52,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(max_chunks);
+ auto active_streams =
+ streams.active_gpu_subset(max_chunks, params.pbs_type);
is_max_value->broadcast_lut(active_streams);
}
@@ -108,7 +109,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
is_non_zero_lut->broadcast_lut(active_streams);
// Scalar may have up to num_radix_blocks blocks
@@ -238,7 +240,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tree_inner_leaf_lut->broadcast_lut(active_streams);
}
@@ -390,7 +393,8 @@ template <typename Torus> struct int_comparison_buffer {
this->op = op;
this->is_signed = is_signed;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
identity_lut_f = [](Torus x) -> Torus { return x; };
@@ -523,7 +527,7 @@ template <typename Torus> struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(1);
+ auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_lut->broadcast_lut(active_streams);
}
preallocated_h_lut = (Torus *)malloc(

View File

@@ -116,7 +116,8 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
+ auto active_streams = streams.active_gpu_subset(
+ num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
decompression_rescale_lut->broadcast_lut(active_streams);
}
}

View File

@@ -356,7 +356,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
luts[j]->broadcast_lut(active_streams);
}
}
@@ -1012,7 +1013,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
- auto active_streams_1 = streams.active_gpu_subset(1);
+ auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
masking_luts_1[i]->broadcast_lut(active_streams_1);
generate_device_accumulator<Torus>(
@@ -1021,7 +1022,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
- auto active_streams_2 = streams.active_gpu_subset(num_blocks);
+ auto active_streams_2 =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
masking_luts_2[i]->broadcast_lut(active_streams_2);
}
@@ -1040,7 +1042,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
- auto active_streams = streams.active_gpu_subset(num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
@@ -1128,7 +1131,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
- auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
+ auto active_gpu_count_for_bits =
+ streams.active_gpu_subset(1, params.pbs_type);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -1152,7 +1156,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
- auto active_streams = streams.active_gpu_subset(2 * num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(2 * num_blocks, params.pbs_type);
this->params = params;
if (params.message_modulus == 4 && params.carry_modulus == 4 &&
@@ -1473,7 +1478,8 @@ template <typename Torus> struct int_div_rem_memory {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
- this->active_streams = streams.active_gpu_subset(num_blocks);
+ this->active_streams =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
this->params = params;
this->is_signed = is_signed;
@@ -1559,7 +1565,7 @@ template <typename Torus> struct int_div_rem_memory {
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
- streams.active_gpu_subset(1); // only 1 block needed
+ streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
}
}

View File

@@ -20,7 +20,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
this->univ_lut_mem =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -246,7 +247,8 @@ template <typename Torus> struct int_ilog2_buffer {
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
lut_message_lambda, allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(counter_num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
lut_message_not->broadcast_lut(active_streams);
this->lut_carry_not =

View File

@@ -371,7 +371,8 @@ struct int_radix_lut_custom_input_output {
this->num_input_blocks = num_input_blocks;
this->gpu_memory_allocated = allocate_gpu_memory;
- this->active_streams = streams.active_gpu_subset(num_radix_blocks);
+ this->active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
}
void setup_degrees() {
@@ -382,14 +383,18 @@ struct int_radix_lut_custom_input_output {
void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
+ int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+ ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+ : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
for (uint i = 0; i < active_streams.count(); i++) {
cuda_set_device(active_streams.gpu_index(i));
int8_t *gpu_pbs_buffer;
- auto num_blocks_on_gpu =
- std::min((int)num_radix_blocks,
- std::max(THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(num_radix_blocks, i,
- active_streams.count())));
+ auto num_blocks_on_gpu = std::min(
+ (int)num_radix_blocks,
+ std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
+ active_streams.count())));
uint64_t size = 0;
execute_scratch_pbs<OutputTorus>(
@@ -424,18 +429,22 @@ struct int_radix_lut_custom_input_output {
/// back to the original indexing
multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
- size_tracker, allocate_gpu_memory);
+ size_tracker, params.pbs_type,
+ allocate_gpu_memory);
multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1,
- size_tracker, allocate_gpu_memory);
+ size_tracker, params.pbs_type,
+ allocate_gpu_memory);
if (num_many_lut > 1) {
multi_gpu_alloc_lwe_many_lut_output_async(
active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
- params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory);
+ params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
+ allocate_gpu_memory);
} else {
multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
- size_tracker, allocate_gpu_memory);
+ size_tracker, params.pbs_type,
+ allocate_gpu_memory);
}
multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
num_radix_blocks, size_tracker,
@@ -451,12 +460,14 @@ struct int_radix_lut_custom_input_output {
}
void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+ int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+ ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+ : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
- auto inputs_on_gpu =
- std::min((int)num_input_blocks,
- std::max(THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(num_input_blocks, 0,
- active_streams.count())));
+ auto inputs_on_gpu = std::min(
+ (int)num_input_blocks,
+ std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
+ active_streams.count())));
if (inputs_on_gpu >= get_threshold_ks_gemm()) {
for (auto i = 0; i < active_streams.count(); ++i) {
@@ -798,16 +809,20 @@ struct int_radix_lut_custom_input_output {
void allocate_lwe_vector_for_non_trivial_indexes(
CudaStreams streams, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
+ int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+ ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+ : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
// We need to create the auxiliary array only in GPU 0
if (active_streams.count() > 1) {
lwe_aligned_vec.resize(active_streams.count());
for (uint i = 0; i < active_streams.count(); i++) {
uint64_t size_tracker_on_array_i = 0;
- auto inputs_on_gpu =
- std::min((int)max_num_radix_blocks,
- std::max(THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(max_num_radix_blocks, i,
- active_streams.count())));
+ auto inputs_on_gpu = std::min(
+ (int)max_num_radix_blocks,
+ std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
+ active_streams.count())));
InputTorus *d_array =
(InputTorus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) *
@@ -998,8 +1013,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams =
- streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+ auto active_streams = streams.active_gpu_subset(
+ bits_per_block * num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
/**
@@ -1266,7 +1281,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
- auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
+ auto active_gpu_count_mc =
+ streams.active_gpu_subset(pbs_count, params.pbs_type);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
}
}
@@ -1436,7 +1452,8 @@ template <typename Torus> struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(num_seq_luts);
+ auto active_streams =
+ streams.active_gpu_subset(num_seq_luts, params.pbs_type);
lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
};
@@ -1490,7 +1507,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_groups);
+ auto active_streams =
+ streams.active_gpu_subset(num_groups, params.pbs_type);
lut_hillis_steele->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -1667,7 +1685,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_first_step->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -1932,7 +1951,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_second_step->broadcast_lut(active_streams);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -1957,7 +1977,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
- auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+ auto new_active_streams = streams.active_gpu_subset(
+ new_num_blocks, luts_array_second_step->params.pbs_type);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(new_active_streams, false);
@@ -2124,7 +2145,7 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(1);
+ auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut_overflow_flag_prep->broadcast_lut(active_streams);
}
@@ -2196,7 +2217,8 @@ template <typename Torus> struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
- auto active_streams = streams.active_gpu_subset(num_radix_blocks + 1);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
};
@@ -2393,7 +2415,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_first_step->broadcast_lut(active_streams);
};
@@ -2404,7 +2427,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
- auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+ auto new_active_streams = streams.active_gpu_subset(
+ new_num_blocks, luts_array_first_step->params.pbs_type);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(new_active_streams, false);
}
@@ -2499,7 +2523,8 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
- active_streams = streams.active_gpu_subset(num_radix_blocks);
+ active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
@@ -2520,7 +2545,8 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_borrow_flag->broadcast_lut(active_streams);
}
- active_streams = streams.active_gpu_subset(num_radix_blocks);
+ active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
2);
};
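The min/max clamp this file's hunks parameterize appears several times above and is easy to misread, so here is a standalone restatement under the same assumed semantics (blocks_on_gpu is an invented name; the result of the repo's get_num_inputs_on_gpu is passed in directly as `slice`):

```cpp
#include <algorithm>
#include <cassert>

// Each GPU sizes its scratch buffers for its own slice of the radix
// blocks, padded up to the pbs_type-dependent threshold, but never
// beyond the total number of blocks.
int blocks_on_gpu(int num_radix_blocks, int threshold, int slice) {
  return std::min(num_radix_blocks, std::max(threshold, slice));
}

int main() {
  assert(blocks_on_gpu(64, 128, 32) == 64);     // small input: padding caps at the total
  assert(blocks_on_gpu(4096, 128, 512) == 512); // large input: the GPU's own slice wins
}
```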

View File

@@ -45,7 +45,8 @@ template <typename Torus> struct int_mul_memory {
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
@@ -122,7 +123,8 @@ template <typename Torus> struct int_mul_memory {
streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_streams = streams.active_gpu_subset(total_block_count);
auto active_streams =
streams.active_gpu_subset(total_block_count, params.pbs_type);
luts_array->broadcast_lut(active_streams);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(

View File

@@ -126,7 +126,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks_to_process * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto active_streams =
streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
luts->broadcast_lut(active_streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

View File

@@ -91,7 +91,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
@@ -177,7 +178,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
@@ -220,7 +222,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
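The three comment lines above are the whole algorithm; a plaintext analogue (illustrative only, blocks stored least-significant first as in radix ciphertexts, `padding_block` standing in for the block the second PBS computes):

// Plaintext sketch of the arithmetic-shift padding trick.
fn arithmetic_shift_right_blocks(blocks: &mut Vec<u8>, n: usize, padding_block: u8) {
    blocks.drain(..n); // drop the n least-significant blocks
    // replicate the padding block at the most-significant end
    blocks.extend(std::iter::repeat(padding_block).take(n));
}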
@@ -276,7 +278,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_streams_shift_last = streams.active_gpu_subset(1);
auto active_streams_shift_last =
streams.active_gpu_subset(1, params.pbs_type);
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
@@ -302,7 +305,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
// auto active_streams = streams.active_gpu_subset(1);
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
padding_block_lut_univariate->broadcast_lut(active_streams);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -344,7 +347,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_streams_shift_blocks =
streams.active_gpu_subset(num_radix_blocks);
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);

View File

@@ -119,8 +119,8 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
auto active_gpu_count_mux =
streams.active_gpu_subset(bits_per_block * num_radix_blocks);
auto active_gpu_count_mux = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
mux_lut->broadcast_lut(active_gpu_count_mux);
auto cleaning_lut_f = [params](Torus x) -> Torus {
@@ -132,7 +132,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
streams.active_gpu_subset(num_radix_blocks);
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
}

View File

@@ -108,7 +108,8 @@ template <typename Torus> struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array->broadcast_lut(active_streams);
luts_borrow_propagation_sum->broadcast_lut(active_streams);
message_acc->broadcast_lut(active_streams);

View File

@@ -38,7 +38,8 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
uint32_t num_gpus = active_streams.count();

View File

@@ -40,7 +40,8 @@ template <typename Torus> struct int_equality_selectors_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -154,7 +155,8 @@ template <typename Torus> struct int_possible_results_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -207,7 +209,8 @@ template <typename Torus> struct int_possible_results_buffer {
params.message_modulus, params.carry_modulus, fns,
allocate_gpu_memory);
current_lut->broadcast_lut(streams.active_gpu_subset(1));
current_lut->broadcast_lut(
streams.active_gpu_subset(1, params.pbs_type));
stream_luts[lut_count++] = current_lut;
lut_value_start += luts_in_this_call;
}
@@ -282,7 +285,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams);
@@ -300,7 +304,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->stream_identity_luts[i] = lut;
}
@@ -321,7 +326,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -333,7 +338,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -628,7 +633,8 @@ template <typename Torus> struct int_unchecked_contains_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -703,7 +709,8 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1094,7 +1101,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1184,7 +1192,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1200,7 +1209,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
}
void release(CudaStreams streams) {
@@ -1292,7 +1302,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1372,7 +1383,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1388,7 +1400,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
}
void release(CudaStreams streams) {
@@ -1462,7 +1475,8 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1523,7 +1537,8 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);

View File

@@ -47,7 +47,7 @@ template <typename Torus> struct compact_lwe_list {
template <typename Torus> struct flattened_compact_lwe_lists {
Torus *d_ptr;
Torus **d_ptr_to_compact_list;
Torus **ptr_array_to_d_compact_list;
const uint32_t *h_num_lwes_per_compact_list;
uint32_t num_compact_lists;
uint32_t lwe_dimension;
@@ -59,13 +59,13 @@ template <typename Torus> struct flattened_compact_lwe_lists {
uint32_t lwe_dimension)
: d_ptr(d_ptr), h_num_lwes_per_compact_list(h_num_lwes_per_compact_list),
num_compact_lists(num_compact_lists), lwe_dimension(lwe_dimension) {
d_ptr_to_compact_list =
static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus **)));
ptr_array_to_d_compact_list =
static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus *)));
total_num_lwes = 0;
auto curr_list = d_ptr;
for (auto i = 0; i < num_compact_lists; ++i) {
total_num_lwes += h_num_lwes_per_compact_list[i];
d_ptr_to_compact_list[i] = curr_list;
ptr_array_to_d_compact_list[i] = curr_list;
curr_list += lwe_dimension + h_num_lwes_per_compact_list[i];
}
}
@@ -75,10 +75,12 @@ template <typename Torus> struct flattened_compact_lwe_lists {
PANIC("index out of range in flattened_compact_lwe_lists::get");
}
return compact_lwe_list(d_ptr_to_compact_list[compact_list_index],
return compact_lwe_list(ptr_array_to_d_compact_list[compact_list_index],
lwe_dimension,
h_num_lwes_per_compact_list[compact_list_index]);
}
void release() { free(ptr_array_to_d_compact_list); }
};
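The host-side pointer table that the new release() frees holds prefix sums over the flattened layout: each compact list occupies lwe_dimension + num_lwes elements (one shared mask plus one body per LWE), as the constructor's loop shows. A Rust sketch of the same offset computation (illustrative, not part of the backend):

// Illustrative: starting offset of each compact list in the flattened buffer,
// mirroring the constructor's pointer arithmetic.
fn compact_list_offsets(num_lwes_per_list: &[usize], lwe_dimension: usize) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(num_lwes_per_list.len());
    let mut curr = 0;
    for &n in num_lwes_per_list {
        offsets.push(curr);
        curr += lwe_dimension + n; // shared mask + one body per LWE
    }
    offsets
}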
/*
@@ -121,7 +123,6 @@ template <typename Torus> struct zk_expand_mem {
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
casting_key_type(casting_key_type) {
gpu_memory_allocated = allocate_gpu_memory;
// We copy num_lwes_per_compact_list so we get protection against
@@ -289,7 +290,8 @@ template <typename Torus> struct zk_expand_mem {
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(2 * num_lwes);
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
message_and_carry_extract_luts->broadcast_lut(active_streams);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
@@ -313,7 +315,6 @@ template <typename Torus> struct zk_expand_mem {
}
void release(CudaStreams streams) {
message_and_carry_extract_luts->release(streams);
delete message_and_carry_extract_luts;

View File

@@ -153,7 +153,8 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_chunks);
auto active_streams =
streams.active_gpu_subset(num_chunks, params.pbs_type);
is_max_value_lut->broadcast_lut(active_streams);
}
lut = is_max_value_lut;
@@ -172,8 +173,8 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_gpu_count_is_max =
streams.active_gpu_subset(is_max_value_lut->num_blocks);
auto active_gpu_count_is_max = streams.active_gpu_subset(
is_max_value_lut->num_blocks, params.pbs_type);
is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
@@ -488,7 +489,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
polynomial_size, message_modulus, carry_modulus, f, true,
tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
last_lut->broadcast_lut(active_streams);
// Last leaf

View File

@@ -339,7 +339,9 @@ host_integer_decompress(CudaStreams streams,
/// dimension to a big LWE dimension
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->decompression_rescale_lut;
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
auto active_streams = streams.active_gpu_subset(
num_blocks_to_decompress,
h_mem_ptr->decompression_rescale_lut->params.pbs_type);
if (active_streams.count() == 1) {
execute_pbs_async<Torus, Torus>(
active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,

View File

@@ -542,7 +542,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -645,7 +646,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -764,7 +766,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -1812,7 +1815,8 @@ uint64_t scratch_cuda_apply_univariate_lut(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
(*mem_ptr)->broadcast_lut(active_streams);
POP_RANGE()
return size_tracker;
@@ -1847,7 +1851,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
(*mem_ptr)->broadcast_lut(active_streams);
POP_RANGE()
return size_tracker;
@@ -1883,7 +1888,8 @@ uint64_t scratch_cuda_apply_bivariate_lut(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
(*mem_ptr)->broadcast_lut(active_streams);
POP_RANGE()
return size_tracker;
@@ -2336,8 +2342,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_streams =
streams.active_gpu_subset(lwe_array_out->num_radix_blocks);
auto active_streams = streams.active_gpu_subset(
lwe_array_out->num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<InputTorus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

View File

@@ -388,7 +388,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);
auto active_streams = streams.active_gpu_subset(total_ciphertexts);
auto active_streams =
streams.active_gpu_subset(total_ciphertexts, mem_ptr->params.pbs_type);
GPU_ASSERT(total_ciphertexts <= mem_ptr->luts_message_carry->num_blocks,
"SUM CT");
@@ -442,7 +443,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
streams.stream(0), streams.gpu_index(0), current_blocks,
num_radix_blocks, num_radix_blocks + 1);
auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks);
auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks,
mem_ptr->params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(

View File

@@ -29,7 +29,8 @@ void host_integer_grouped_oprf(CudaStreams streams,
int_grouped_oprf_memory<Torus> *mem_ptr,
void *const *bsks) {
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process,
mem_ptr->params.pbs_type);
auto lut = mem_ptr->luts;
if (active_streams.count() == 1) {

View File

@@ -34,49 +34,49 @@ void cuda_rerand_64(
switch (rerand_buffer->params.big_lwe_dimension) {
case 256:
rerand_inplace<uint64_t, AmortizedDegree<256>>(
host_rerand_inplace<uint64_t, AmortizedDegree<256>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 512:
rerand_inplace<uint64_t, AmortizedDegree<512>>(
host_rerand_inplace<uint64_t, AmortizedDegree<512>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 1024:
rerand_inplace<uint64_t, AmortizedDegree<1024>>(
host_rerand_inplace<uint64_t, AmortizedDegree<1024>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 2048:
rerand_inplace<uint64_t, AmortizedDegree<2048>>(
host_rerand_inplace<uint64_t, AmortizedDegree<2048>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 4096:
rerand_inplace<uint64_t, AmortizedDegree<4096>>(
host_rerand_inplace<uint64_t, AmortizedDegree<4096>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 8192:
rerand_inplace<uint64_t, AmortizedDegree<8192>>(
host_rerand_inplace<uint64_t, AmortizedDegree<8192>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 16384:
rerand_inplace<uint64_t, AmortizedDegree<16384>>(
host_rerand_inplace<uint64_t, AmortizedDegree<16384>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
@@ -88,9 +88,6 @@ void cuda_rerand_64(
" in the interval [256..16384].");
break;
}
cuda_synchronize_stream(static_cast<cudaStream_t>(streams.streams[0]),
streams.gpu_indexes[0]);
}
void cleanup_cuda_rerand(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

View File

@@ -10,7 +10,7 @@
#include "zk/zk_utilities.h"
template <typename Torus, class params>
void rerand_inplace(
void host_rerand_inplace(
CudaStreams const streams, Torus *lwe_array,
const Torus *lwe_flattened_encryptions_of_zero_compact_array_in,
Torus *const *ksk, int_rerand_mem<Torus> *mem_ptr) {
@@ -73,6 +73,7 @@ void rerand_inplace(
message_modulus, carry_modulus);
release_cpu_radix_ciphertext_async(&lwes_ffi);
release_cpu_radix_ciphertext_async(&ksed_zero_lwes_ffi);
compact_lwe_lists.release();
}
template <typename Torus>

View File

@@ -45,7 +45,8 @@ host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
auto active_streams = streams.active_gpu_subset(
num_clear_blocks, mem_ptr->lut->params.pbs_type);
lut->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table<Torus>(

View File

@@ -146,7 +146,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -240,7 +240,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table<Torus>(
@@ -274,7 +274,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -419,7 +419,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table<Torus>(
@@ -521,7 +521,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
signed_msb_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, true, mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_msb_lut->broadcast_lut(active_streams);
CudaRadixCiphertextFFI sign_block;
@@ -567,7 +567,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -785,8 +785,8 @@ __host__ void host_scalar_equality_check(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
lsb_streams.gpu_index(0));
}
auto active_streams =
lsb_streams.active_gpu_subset(num_halved_scalar_blocks);
auto active_streams = lsb_streams.active_gpu_subset(
num_halved_scalar_blocks, params.pbs_type);
// We use false because we will only broadcast the indexes
scalar_comparison_luts->broadcast_lut(active_streams, false);

View File

@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
@@ -321,8 +321,9 @@ __host__ void execute_cg_external_product_loop(
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
uint32_t chunk_size = (uint32_t)(std::min(
lwe_chunk_size,
(uint64_t)(lwe_dimension / grouping_factor) - lwe_offset));
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;
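The widening/narrowing cleanup above is mechanical: the loop walks the lwe_dimension / grouping_factor keybundle groups in chunks of lwe_chunk_size, clamping the final chunk, and chunk sizes stay far below 2^32 in practice, so the cast is lossless. The same arithmetic in Rust (illustrative):

// Illustrative: size of the chunk starting lwe_offset groups in.
fn clamped_chunk(lwe_chunk_size: u64, num_groups: u64, lwe_offset: u64) -> u32 {
    lwe_chunk_size.min(num_groups - lwe_offset) as u32
}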

View File

@@ -373,7 +373,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
double2 *join_buffer, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t iteration, uint64_t lwe_chunk_size,
uint32_t level_count, uint32_t iteration, uint32_t lwe_chunk_size,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
@@ -790,7 +790,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
uint32_t lut_stride) {
cuda_set_device(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);

View File

@@ -626,7 +626,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
if (!cuda_check_support_thread_block_clusters())
return false;
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(

View File

@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) {
@@ -205,10 +205,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t num_many_lut, uint32_t lut_stride) {
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
uint32_t lwe_dimension, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint64_t keybundle_size_per_input, uint32_t num_many_lut,
uint32_t lut_stride) {
constexpr uint32_t level_count = 1;
constexpr uint32_t grouping_factor = 4;
@@ -548,8 +548,9 @@ __host__ void execute_tbc_external_product_loop(
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
uint32_t chunk_size = (uint32_t)(std::min(
lwe_chunk_size,
(uint64_t)(lwe_dimension / grouping_factor) - lwe_offset));
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;
@@ -624,9 +625,9 @@ __host__ void execute_tbc_external_product_loop(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, lwe_offset, chunk_size,
keybundle_size_per_input, num_many_lut, lut_stride));
lwe_array_in, lwe_input_indexes, keybundle_fft, global_accumulator,
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input,
num_many_lut, lut_stride));
} else {
check_cuda_error(cudaLaunchKernelEx(
&config,

View File

@@ -5,7 +5,8 @@
std::mutex m;
bool p2p_enabled = false;
const int THRESHOLD_MULTI_GPU = 12;
const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;
// Enable bidirectional p2p access between all available GPUs and device_0_id
int32_t cuda_setup_multi_gpu(int device_0_id) {
@@ -39,10 +40,13 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
return (int32_t)(num_used_gpus);
}
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type) {
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
uint32_t ceil_div_inputs =
std::max((uint32_t)1,
(num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
std::max((uint32_t)1, (num_inputs + threshold - 1) / threshold);
uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
return active_gpu_count;
}
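This hunk is the heart of the threshold refactor that all the active_gpu_subset call sites above feed into: the batching threshold now depends on the PBS flavor before a ceiling division picks the GPU count. A self-contained Rust restatement of the arithmetic (names illustrative, not part of the CUDA backend):

// Illustrative restatement of get_active_gpu_count.
const THRESHOLD_MULTI_BIT: u32 = 12;
const THRESHOLD_CLASSICAL: u32 = 68;

fn active_gpu_count(num_inputs: u32, gpu_count: u32, multi_bit: bool) -> u32 {
    let threshold = if multi_bit { THRESHOLD_MULTI_BIT } else { THRESHOLD_CLASSICAL };
    // ceiling division: one GPU per `threshold` inputs, at least one GPU
    let ceil_div_inputs = ((num_inputs + threshold - 1) / threshold).max(1);
    ceil_div_inputs.min(gpu_count)
}

fn main() {
    // 64 radix blocks with classical PBS stay on one GPU (64 < 68)...
    assert_eq!(active_gpu_count(64, 8, false), 1);
    // ...while the same workload with multi-bit PBS spreads over ceil(64/12) = 6 GPUs.
    assert_eq!(active_gpu_count(64, 8, true), 6);
}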

View File

@@ -59,15 +59,20 @@ template <typename Torus>
void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
uint32_t num_inputs, uint32_t lwe_size,
uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
PANIC_IF_FALSE(dest.empty(),
"Cuda error: Requested multi-GPU vector is already allocated");
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker_on_gpu_i = 0;
auto inputs_on_gpu = std::min(
(int)num_inputs,
std::max(THRESHOLD_MULTI_GPU,
std::max((int)threshold,
get_num_inputs_on_gpu(num_inputs, i, streams.count())));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams.stream(i),
@@ -81,7 +86,7 @@ void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
template void multi_gpu_alloc_lwe_async<__uint128_t>(
CudaStreams streams, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
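The same PBS-dependent threshold also floors the per-GPU allocations: each GPU provisions room for at least threshold inputs even when its fair share is smaller (presumably so downstream scatters have headroom; that rationale is an assumption, not stated in the diff). The sizing rule in Rust (illustrative):

// Illustrative: inputs provisioned on one GPU given its fair share.
fn inputs_on_gpu(num_inputs: u32, share: u32, threshold: u32) -> u32 {
    num_inputs.min(share.max(threshold))
}

fn main() {
    // 64 inputs over 8 GPUs gives a fair share of 8, but with the
    // multi-bit threshold of 12 each GPU still provisions 12 slots.
    assert_eq!(inputs_on_gpu(64, 8, 12), 12);
}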
/// Allocates the input/output vector for all devices
@@ -91,16 +96,21 @@ template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
CudaStreams streams, std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
PANIC_IF_FALSE(dest.empty(),
"Cuda error: Requested multi-GPU vector is already allocated");
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker = 0;
auto inputs_on_gpu = std::min(
(int)num_inputs,
std::max(THRESHOLD_MULTI_GPU,
std::max((int)threshold,
get_num_inputs_on_gpu(num_inputs, i, streams.count())));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus),

View File

@@ -100,6 +100,7 @@ __host__ void host_expand_without_verification(
2 * num_lwes);
release_cpu_radix_ciphertext_async(&input);
release_cpu_radix_ciphertext_async(&output);
compact_lwe_lists.release();
}
template <typename Torus>

View File

@@ -113,6 +113,7 @@ pub fn iop_add_simd(prog: &mut Program) {
prog,
crate::asm::iop::SIMD_N,
fw_impl::llt::iop_add_ripple_rtl,
None,
);
}
@@ -227,14 +228,18 @@ pub fn iop_muls(prog: &mut Program) {
pub fn iop_erc_20(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
iop_erc_20_rtl(prog, 0).add_to_prog(prog);
// TODO: Sweep kogge_blk_w
// All these little parameters would make it easy to write an
// exploration/compilation program that tries to minimize latency by
// tuning them.
iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl, None);
}
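Why the block width matters: ripple carry resolves carries strictly sequentially, while a Kogge-Stone tree over blocks of width blk_w shortens the critical path to roughly blk_w plus a log-depth tree across blocks. A toy depth model (purely illustrative, not the firmware's cost function):

// Illustrative depth model for an n-digit addition.
fn ripple_depth(n: usize) -> usize {
    n
}

fn block_kogge_depth(n: usize, blk_w: usize) -> usize {
    let blocks = n.div_ceil(blk_w);
    // in-block ripple plus a log2-depth carry tree across blocks
    blk_w + blocks.next_power_of_two().trailing_zeros() as usize
}

fn main() {
    // For 64 digits, blk_w = 10 (the value ERC_20 now passes) trades
    // ~64 sequential carry steps for ~10 + 3.
    assert_eq!(ripple_depth(64), 64);
    assert_eq!(block_kogge_depth(64, 10), 13);
}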
#[instrument(level = "trace", skip(prog))]
@@ -381,7 +386,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
// Allocate metavariables:
// Dest -> Operand
let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
@@ -392,13 +397,6 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
// Src Amount -> Operand
let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
// TODO: Make this a parameter or sweep this
// All these little parameters would be very handy to write an
// exploration/compilation program which would try to minimize latency by
// playing with these.
let kogge_blk_w = 10;
let ripple = true;
{
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
@@ -429,19 +427,20 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
})
.collect::<Vec<_>>();
if ripple {
if let Some(blk_w) = kogge_blk_w {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, blk_w)
} else { // Default to ripple carry
kogge::ripple_add(dst_to, src_to, src_amount.clone(), None)
+ kogge::ripple_sub(prog, dst_from, src_from, src_amount)
} else {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
}
}
}
/// A SIMD implementation of add for maximum throughput
/// NB: No use of kogge_blk_w here; the impl is forced to use ripple carry
#[instrument(level = "trace", skip(prog))]
pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8, _kogge_blk_w: Option<usize>) -> Rtl {
// Allocate metavariables:
let dst = prog.iop_template_var(OperandKind::Dst, i);
let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
@@ -899,13 +898,13 @@ fn bw_inv(prog: &mut Program, b: Vec<VarCell>) -> Vec<VarCell> {
/// Maybe this should go into a SIMD firmware implementation... At some point we
/// would need a mechanism to choose between implementations on the fly to make
really good use of all of this.
fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F)
fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F, kogge_blk_w: Option<usize>)
where
F: Fn(&mut Program, u8) -> Rtl,
F: Fn(&mut Program, u8, Option<usize>) -> Rtl,
{
(0..batch_size)
.map(|i| i as u8)
.map(|i| rtl_closure(prog, i))
.map(|i| rtl_closure(prog, i, kogge_blk_w))
.sum::<Rtl>()
.add_to_prog(prog);
}

View File

@@ -32,10 +32,9 @@ fi
RUSTFLAGS="$RUSTFLAGS" cargo nextest list --cargo-profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu-debug,zk-pok -p tfhe &> /tmp/test_list.txt
# Filter the tests to get only the HL and a subset of core crypto ones
TESTS_TO_RUN=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep -E 'high_level_api::.*gpu.*|core_crypto::.*gpu.*' | grep -v 'array' | grep -v 'modulus_switch' | grep -v '3_3' | grep -v 'noise_distribution' | grep -v 'flip')
if [[ "${RUN_VALGRIND}" == "1" ]]; then
TESTS_TO_RUN=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep -E 'high_level_api::.*gpu.*' | grep -v 'array' | grep -v 'flip')
# Build the tests but don't run them
RUSTFLAGS="$RUSTFLAGS" cargo test --no-run --profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu-debug,zk-pok -p tfhe
@@ -57,6 +56,7 @@ if [[ "${RUN_VALGRIND}" == "1" ]]; then
fi
if [[ "${RUN_COMPUTE_SANITIZER}" == "1" ]]; then
TESTS_TO_RUN=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep -E 'high_level_api::.*gpu.*|core_crypto::.*gpu.*' | grep -v 'array' | grep -v 'modulus_switch' | grep -v '3_3' | grep -v 'noise_distribution' | grep -v 'flip')
# Build the tests but don't run them
RUSTFLAGS="$RUSTFLAGS" cargo test --no-run --profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe

View File

@@ -28,13 +28,12 @@ pub fn transfer_whitepaper<FheType>(
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + FheTrivialEncrypt<u64>,
FheBool: IfThenElse<FheType>,
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenZero<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let zero_amount = FheType::encrypt_trivial(0u64);
let amount_to_transfer = has_enough_funds.select(amount, &zero_amount);
let amount_to_transfer = has_enough_funds.if_then_zero(amount);
let new_to_amount = to_amount + &amount_to_transfer;
let new_from_amount = from_amount - &amount_to_transfer;
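A plaintext analogue makes the semantics of the rewrite explicit: the trivial zero and the full select collapse into one conditional zeroing (illustrative only; the ordering of the returned pair is not meant to mirror the library's):

// Plaintext analogue of the whitepaper transfer after this change.
fn transfer(from: u64, to: u64, amount: u64) -> (u64, u64) {
    let has_enough_funds = from >= amount;
    // if_then_zero: keep `amount` when the condition holds, else zero
    let amount_to_transfer = if has_enough_funds { amount } else { 0 };
    (to + amount_to_transfer, from - amount_to_transfer)
}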
@@ -51,12 +50,13 @@ pub fn par_transfer_whitepaper<FheType>(
where
FheType:
Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Send + Sync + FheTrivialEncrypt<u64>,
FheBool: IfThenElse<FheType>,
FheBool: IfThenZero<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let zero_amount = FheType::encrypt_trivial(0u64);
let amount_to_transfer = has_enough_funds.select(amount, &zero_amount);
//let zero_amount = FheType::encrypt_trivial(0u64);
//let amount_to_transfer = has_enough_funds.select(amount, &zero_amount);
let amount_to_transfer = has_enough_funds.if_then_zero(amount);
let (new_to_amount, new_from_amount) = rayon::join(
|| to_amount + &amount_to_transfer,

View File

@@ -33,8 +33,8 @@ impl ProofConfig {
fn default_proof_config() -> Vec<ProofConfig> {
vec![
ProofConfig::new(64usize, &[64usize]),
ProofConfig::new(2048, &[4 * 64, 10 * 64, 2048]),
ProofConfig::new(64, &[64]),
ProofConfig::new(2048, &[64, 4 * 64, 2048]),
ProofConfig::new(4096, &[4096]),
]
}

View File

@@ -7,7 +7,7 @@ use crate::high_level_api::integers::{FheInt, FheIntId, FheUint, FheUintId};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::re_randomization::ReRandomizationMetadata;
use crate::high_level_api::traits::{
FheEq, Flip, IfThenElse, ReRandomize, ScalarIfThenElse, Tagged,
FheEq, Flip, IfThenElse, IfThenZero, ReRandomize, ScalarIfThenElse, Tagged,
};
use crate::high_level_api::{global_state, CompactPublicKey};
use crate::integer::block_decomposition::DecomposableInto;
@@ -552,6 +552,66 @@ where
}
}
impl<Id> IfThenZero<FheUint<Id>> for FheBool
where
Id: FheUintId,
{
/// Conditional selection.
///
/// The output value returned depends on the value of `self`.
///
/// - if `self` is true, the output will have the value of `ct_then`
/// - if `self` is false, the output will be an encryption of 0
fn if_then_zero(&self, ct_then: &FheUint<Id>) -> FheUint<Id> {
global_state::with_internal_keys(|sks| match sks {
InternalServerKey::Cpu(cpu_sks) => {
let ct_condition = self;
let mut ct_out = ct_then.ciphertext.on_cpu().clone();
cpu_sks.pbs_key().zero_out_if_condition_is_false(
&mut ct_out,
&ct_condition.ciphertext.on_cpu().0,
);
FheUint::new(
ct_out,
cpu_sks.tag.clone(),
ReRandomizationMetadata::default(),
)
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
panic!("Cuda does not support if_then_zero")
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(device) => {
let hpu_then = ct_then.ciphertext.on_hpu(device);
let hpu_cond = self.ciphertext.on_hpu(device);
let (opcode, proto) = {
let asm_iop = &hpu_asm::iop::IOP_IF_THEN_ZERO;
(
asm_iop.opcode(),
&asm_iop.format().expect("Unspecified IOP format").proto,
)
};
// These clones are cheap as they are just Arc
let hpu_result = HpuRadixCiphertext::exec(
proto,
opcode,
&[hpu_then.clone(), hpu_cond.clone()],
&[],
)
.pop()
.unwrap();
FheUint::new(
hpu_result,
device.tag.clone(),
ReRandomizationMetadata::default(),
)
}
})
}
}
impl<Id: FheIntId> IfThenElse<FheInt<Id>> for FheBool {
/// Conditional selection.
///

View File

@@ -11,6 +11,7 @@ pub use crate::high_level_api::traits::{
FheOrd, FheTrivialEncrypt, FheTryEncrypt, FheTryTrivialEncrypt, FheWait, Flip, IfThenElse,
OverflowingAdd, OverflowingMul, OverflowingNeg, OverflowingSub, ReRandomize, RotateLeft,
RotateLeftAssign, RotateRight, RotateRightAssign, ScalarIfThenElse, SquashNoise, Tagged,
IfThenZero,
};
#[cfg(feature = "hpu")]
pub use crate::high_level_api::traits::{FheHpu, HpuHandle};

View File

@@ -149,6 +149,10 @@ pub trait IfThenElse<Ciphertext> {
}
}
pub trait IfThenZero<Ciphertext> {
fn if_then_zero(&self, ct_then: &Ciphertext) -> Ciphertext;
}
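A minimal usage sketch, assuming the prelude re-export added above and an already-configured server key (per the high-level API impl earlier in this diff, only the Cpu and Hpu backends accept the call; the Cuda path panics):

use tfhe::prelude::*;
use tfhe::{FheBool, FheUint64};

// Returns `value` when `cond` is true, an encryption of zero otherwise,
// with no trivial zero ciphertext needed as with `select`.
fn gate(cond: &FheBool, value: &FheUint64) -> FheUint64 {
    cond.if_then_zero(value)
}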
pub trait ScalarIfThenElse<Lhs, Rhs> {
type Output;

View File

@@ -656,18 +656,6 @@ async function compactPublicKeyZeroKnowledgeBench() {
ShortintCompactPublicKeyEncryptionParametersName.PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
},
{
zk_scheme: "ZKV1",
name: shortint_params_name(
ShortintParametersName.PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
block_params: new ShortintParameters(
ShortintParametersName.PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
casting_params: new ShortintCompactPublicKeyEncryptionParameters(
ShortintCompactPublicKeyEncryptionParametersName.V1_5_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
),
},
];
let bench_results = {};
@@ -695,8 +683,8 @@ async function compactPublicKeyZeroKnowledgeBench() {
// Proof configuration:
let proof_configs = [
{ crs_bit_size: 64, bits_to_encrypt: [64] },
{ crs_bit_size: 640, bits_to_encrypt: [640] },
{ crs_bit_size: 2048, bits_to_encrypt: [2048, 64 * 4] }, // 64 * 4 is a production use-case
// 64 * 4 is a production use-case
{ crs_bit_size: 2048, bits_to_encrypt: [64, 4 * 64, 2048] },
{ crs_bit_size: 4096, bits_to_encrypt: [4096] },
];