Compare commits

...

66 Commits

Author SHA1 Message Date
Beka Barbakadze
846eed184e feat(gpu): add memory management 2025-09-18 22:37:51 +02:00
Andrei Stoian
70fa68bf52 fix: max streams 8 2025-09-18 22:09:30 +02:00
Andrei Stoian
7faafd6602 fix(gpu): deadlock 2025-09-18 19:17:25 +02:00
Andrei Stoian
8c55f6b8d7 fix: rebase 2025-09-18 14:47:20 +02:00
Andrei Stoian
f6b1929a8d feat(gpu): stream pools 2025-09-18 13:59:00 +02:00
Andrei Stoian
87c0d646a4 fix(gpu): coprocessor bench 2025-09-18 13:56:55 +02:00
Agnes Leroy
e5b39a6d4d fix(gpu): fix memory leak in multi-gpu calculations 2025-09-18 13:55:03 +02:00
Arthur Meyre
27e2fbd972 chore: add implementation note for the NTT formula 2025-09-18 09:51:53 +02:00
Arthur Meyre
f54fbf52ce chore: bump tfhe-ntt version to 0.6.1 2025-09-18 09:51:53 +02:00
Arthur Meyre
2a0dfa5b17 fix(ntt): same update for 64 bits code 2025-09-18 09:51:53 +02:00
Arthur Meyre
a4841036b7 fix: make sure computations don't overflow for certain primes for 32 bits
- The original code seemed to assume that the Barrett reduction would not
overflow if p <= 2^31; this is incorrect, though overflow is rare
- The correctness constraint actually has a bound much smaller than 2^31;
some primes bigger than the derived threshold can still use the fast code,
provided a certain criterion is respected that corresponds to a "lucky"
case of the Barrett reduction. The new code now handles this

The maths is explained in https://blog.zksecurity.xyz/posts/barrett-tighter-bound/
and copiously in comments in the code
2025-09-18 09:51:53 +02:00
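As a rough illustration of the structure involved, here is a generic Barrett reduction sketch in Rust. It is not the tfhe-ntt implementation; the 64-bit widths and the name `big_q` are assumptions for the example. The tighter bound from the linked post governs when a single conditional correction at the end is enough in the narrower fixed-width variants used in practice.

```rust
/// Generic Barrett-reduction sketch (not the tfhe-ntt code): it shows the
/// estimate-and-correct structure the commit message refers to.
/// `big_q` is the precomputed quotient floor(2^64 / p).
fn barrett_reduce(x: u64, p: u64, big_q: u64) -> u64 {
    // Estimate q ~= floor(x / p) with the precomputed reciprocal; at this
    // width the estimate is off by at most 1 for any x < 2^64.
    let q = ((x as u128 * big_q as u128) >> 64) as u64;
    // Remainder estimate, guaranteed to land in [0, 2p).
    let mut r = x - q * p;
    // One conditional correction; in narrower variants this step is only
    // sufficient under the tighter bound derived in the linked post.
    if r >= p {
        r -= p;
    }
    r
}

fn main() {
    let p: u64 = (1 << 31) - 1; // a prime close to 2^31
    let big_q = ((1u128 << 64) / p as u128) as u64;
    let x: u64 = 123_456_789_012_345;
    assert_eq!(barrett_reduce(x, p, big_q), x % p);
}
```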
Andrei Stoian
1dcc3c8c89 chore(gpu): structure to encapsulate streams 2025-09-18 09:43:17 +02:00
Nicolas Sarlin
1a2643d1da fix(ci): use precise wasm-bindgen version for the cli 2025-09-17 13:17:57 +02:00
David Testé
bc257904e3 chore(ci): fix issue_comment trigger event for regression bench 2025-09-17 12:15:32 +02:00
Arthur Meyre
8982844a5b chore: adapt naming of traits to better match current scheme
- Standard -> Classic when referring to the original PBS implementation
2025-09-17 10:32:40 +02:00
Arthur Meyre
e80d2548af fix: fix noise simulation modulus instantiation 2025-09-17 10:32:40 +02:00
Arthur Meyre
c0ab0a5752 chore: split noise simulation primitives in sub modules
- keep things easier to manage in terms of file size and content density
2025-09-17 10:32:40 +02:00
Arthur Meyre
f7bfe2f10c chore: uniformize noise check tools naming 2025-09-17 10:32:40 +02:00
Arthur Meyre
29c390d92c chore: reorg noise check tools 2025-09-17 10:32:40 +02:00
Pedro Alves
becd08db71 fix(gpu): fix an overflow that may happen when the user tries to allocate a huge amount of blocks 2025-09-16 16:17:32 -03:00
David Testé
ffd7470ef1 chore(ci): check if regression workflow should be run early
Before, any issue comment or label event would trigger the verify-actor job. Then the next job, prepare-benchmarks, would check whether the rest of the workflow should run. Moving this check into verify-actor ensures the whole workflow runs only if required.
2025-09-16 20:55:54 +02:00
David Testé
a3750504c4 chore(ci): use dedicated token to sync repositories 2025-09-16 18:53:17 +02:00
David Testé
378c5ccb73 chore(ci): perform sync on push without third-party action
This is done to better handle git-lfs related changes when syncing
with another repository.
2025-09-16 16:29:44 +02:00
David Testé
4ba1787e12 chore(bench): add crs size in zk-pke benchmark names
This is done to get more details about the benchmarks when parsing
results.
2025-09-16 16:06:41 +02:00
David Testé
366d359441 chore(bench): measure ciphertext and key sizes at a large scale
Ciphertext sizes are measured at the HLAPI layer with several
parameter sets.
Key sizes are measured at the shortint level.
This benchmark now has its own dedicated GitHub workflow, which runs,
at least, on the 24th of each month.
2025-09-16 15:43:36 +02:00
dependabot[bot]
0ece9e684a chore(deps): bump tj-actions/changed-files from 46.0.5 to 47.0.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 46.0.5 to 47.0.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](ed68ef82c0...24d32ffd49)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-version: 47.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-16 14:02:32 +02:00
David Testé
f8684d1f67 chore(ci): add regression benchmark workflow
Regression benchmarks are meant to be run in pull requests. They
can be launched in two flavors:
 * issue comment: using a command like "/bench --backend cpu"
 * adding a label: `bench-perfs-cpu` or `bench-perfs-gpu`

Benchmark definitions are written in TOML and located at
ci/regression.toml.
While not exhaustive, it can be easily modified by reading the
embedded documentation.

"/bench" commands are parsed by a Python script located at
ci/perf_regression.py. This script produces output files that
contain cargo commands and a shell script generating custom
environment variables. The Python script and generated files are
meant to be used only by the workflow
benchmark_perf_regression.yml.
2025-09-16 13:33:49 +02:00
Nicolas Sarlin
b4066df77f chore(ci): run cargo audit 2025-09-16 12:03:32 +02:00
Pedro Alves
6b94872a00 fix(gpu): add an assert to be sure the carry part has correct size in expand 2025-09-15 12:57:11 -03:00
Nicolas Sarlin
d88caff6dd fix(ci): fix serde root crate in tfhe-lints 2025-09-15 15:18:58 +02:00
Thomas Montaigu
75a265f93b fix(integer): fix aggregate_one_hot_vector
`aggregate_one_hot_vector` was modified when the KVStore was
added, to support inputs where information in the blocks was not packed.
To detect whether blocks were packed, it relied on the degree value.

However, the inputs may come from LUTs that had a precise degree,
which could lead the function to believe the inputs were not packed.

To fix this, the function is split in two:
* aggregate_one_hot_vector
* aggregate_and_unpack_one_hot_vector

The correct one is used when we know whether the inputs are packed
2025-09-15 10:27:24 +02:00
Nicolas Sarlin
bfbf638fed fix(zk): add a size check for the public key 2025-09-12 11:10:06 +02:00
David Testé
01651d6fb2 chore(ci): update lattice estimator version 2025-09-12 11:07:25 +02:00
Pedro Alves
b2624d1a76 chore(gpu): refactor the indexing logic for the LWE expand 2025-09-11 13:10:18 -03:00
tmontaigu
9fb7b56629 feat(integer): add KVStore
The KVStore is a hash table with homomorphic capabilities

The keys are meant to be clear integers; the values are meant to be
Radix/SignedRadix

The ServerKey now has functions to perform operations that modify
an existing key/value pair using an encrypted key.
2025-09-11 13:55:42 +02:00
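To make the semantics concrete, here is a clear-data model in Rust; plain u64 values stand in for ciphertexts, and nothing here is the actual tfhe-rs API. The point it illustrates: since the lookup key is encrypted, the server cannot branch on it, so an update driven by an encrypted key conceptually visits every (clear key, value) pair and applies an oblivious select.

```rust
use std::collections::HashMap;

// Clear-data stand-in: `enc_key` and `new_value` model encrypted inputs,
// and the equality test models a homomorphic comparison producing an
// encrypted 0/1 selector.
fn update_with_encrypted_key(store: &mut HashMap<u64, u64>, enc_key: u64, new_value: u64) {
    for (k, v) in store.iter_mut() {
        // Stands in for a homomorphic equality test on the clear key.
        let selector = (*k == enc_key) as u64;
        // Oblivious select: v = selector ? new_value : v
        *v = selector * new_value + (1 - selector) * *v;
    }
}

fn main() {
    let mut store = HashMap::from([(1u64, 10u64), (2, 20), (3, 30)]);
    update_with_encrypted_key(&mut store, 2, 99);
    assert_eq!(store[&2], 99);
    assert_eq!(store[&1], 10); // untouched entries keep their values
}
```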
Arthur Meyre
24feeb8609 chore(ci): avoid backward compat workflow cancel
- re-use formulas from the integer workflow which also executes on main
2025-09-11 10:43:23 +02:00
pgardratzama
757c2fc828 chore(hpu): make hpu integer bench fast by default 2025-09-10 22:24:31 +02:00
pgardratzama
4ff0d6cac2 feat(hpu): integer bench update (adds mod, div -> div_mod), erc20_simd simd batch size read from iop prototype 2025-09-10 22:24:31 +02:00
pgardratzama
1530f52c79 feat(hpu): adds support of ERC20 SIMD in hpu ERC20 bench 2025-09-10 22:24:31 +02:00
David Testé
9918dacd6a chore(ci): change workflow jobs naming convention
The term "bpr" means Branch Protection Rule. It helps one to
identify any job that must pass before being able to merge to the
base branch.
2025-09-10 15:36:45 +02:00
tmontaigu
2b503acf18 chore(shortint): add consts for MetaParameters 2025-09-10 15:15:06 +02:00
tmontaigu
57cc326a64 feat(shortint): MetaParameters struct
There are a lot of different parameter types in tfhe-rs, related to
different but linked features. Thus, when some PBS parameters are
selected, compatible compression parameters must be selected from the
possible parameters.

To make things easier, the MetaParameters struct is added; it stores
in one place parameters that can be used together.
2025-09-10 15:15:06 +02:00
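A minimal sketch of the idea, with hypothetical field names rather than the actual tfhe-rs definition: one value bundles parameter sets that are known to work together, so users pick a coherent group instead of matching them by hand.

```rust
// Illustrative only; these types and fields are not the tfhe-rs definition.
struct PbsParameters;
struct CompressionParameters;

// One coherent bundle: a PBS parameter set together with compression
// parameters known to be compatible with it.
struct MetaParameters {
    pbs: PbsParameters,
    // Optional, since not every PBS parameter set has compression support.
    compression: Option<CompressionParameters>,
}

fn main() {
    let _meta = MetaParameters {
        pbs: PbsParameters,
        compression: Some(CompressionParameters),
    };
}
```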
Arthur Meyre
84eb8aeb63 test(shortint): add BR + DP + KS + MS noise checks
- sanity check, noise measurement and pfail are done
2025-09-10 14:50:28 +02:00
Arthur Meyre
f09acfa581 chore: rename test files to remove redundant name fragment 2025-09-10 14:50:28 +02:00
Arthur Meyre
8335a6b6b5 chore(ci): run backward compat tests on merge to main
- this is to prime the cache and check backward-compatibility data on merge to main
2025-09-10 14:49:50 +02:00
tmontaigu
f80fd157ae fix(c-api): add missing safe_deser for ServerKey 2025-09-10 13:40:44 +02:00
Agnes Leroy
0ed97cfba8 chore(gpu): update sxm5 cost 2025-09-10 10:49:25 +02:00
Agnes Leroy
daee3f1850 chore(gpu): fix out of memory error in 4090 doc tests 2025-09-10 10:46:04 +02:00
tmontaigu
e8dc403ebd feat(integer): add flip operation
Add the flip(condition: BooleanBlock, a: T, b: T) -> (T, T)
operation that homomorphically flips/swaps the two values if the
given encrypted boolean encrypts true
2025-09-10 09:44:28 +02:00
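A clear-value model of the stated signature and semantics (the real operation takes a BooleanBlock condition and works on ciphertexts):

```rust
// The pair is swapped iff the condition is true.
fn flip<T>(condition: bool, a: T, b: T) -> (T, T) {
    if condition { (b, a) } else { (a, b) }
}

fn main() {
    assert_eq!(flip(true, 1, 2), (2, 1));
    assert_eq!(flip(false, 1, 2), (1, 2));
}
```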
Pedro Alves
63e5504c80 doc(gpu): add a section about noise squashing 2025-09-09 13:10:23 -03:00
Nicolas Sarlin
d664e4ada6 docs(safe_ser): document panics if max size is too large 2025-09-09 17:03:23 +02:00
Pedro Alves
c78cc2d2e9 chore(gpu): add a benchmark for 128-bit multi-bit noise squashing
- Also, remove the lut indexes concept from the 128-bit multi-bit PBS. The entire backend assumes it does not exist (as it indeed doesn't for the classical PBS), so keeping it here would be error-prone.
2025-09-09 07:51:35 -03:00
Pedro Alves
b566d78621 chore(gpu): improve the 128-bit multi-bit PBS core crypto test 2025-09-09 07:51:35 -03:00
Pedro Alves
7da6786d59 feat(gpu): add support to the 128-bit multi-bit PBS on HL's noise squashing 2025-09-09 07:51:35 -03:00
Himess
6edf6b9e26 chore: gate backward_compatibility_tests.rs with shortint feature 2025-09-09 09:35:59 +02:00
Himess
6fde90ad9c chore(clap): Replace use of deprecated attributes
Replace deprecated #[clap(...)] attributes with #[arg]/#[command] and remove redundant uses of value_parser
2025-09-09 09:35:59 +02:00
Agnes Leroy
5d70ae4232 fix(gpu): add missing broadcast lut 2025-09-09 08:47:53 +02:00
David Testé
89b36ebca0 chore(bench): remove 2-bits size for full precision bench on gpu
The GPU backend cannot accept fewer than 2 blocks for integer
benchmarks. Since 2-bit precision benchmarks are run with
*_MESSAGE_2_CARRY_2_* parameters, they create only one block of
ciphertext, making the benchmarks unsuitable for the GPU backend.
2025-09-08 12:24:24 +02:00
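A small worked example of the block arithmetic behind this; the helper below is illustrative, not a tfhe-rs function.

```rust
// With *_MESSAGE_2_CARRY_2_* parameters each block carries 2 message bits,
// so a 2-bit integer needs only a single block.
fn num_blocks(precision_bits: u32, message_bits_per_block: u32) -> u32 {
    precision_bits.div_ceil(message_bits_per_block)
}

fn main() {
    assert_eq!(num_blocks(2, 2), 1); // below the GPU backend's 2-block minimum
    assert_eq!(num_blocks(4, 2), 2); // smallest precision that still qualifies
}
```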
dependabot[bot]
bfc97385f4 chore(deps): bump actions/stale from 9.1.0 to 10.0.0
Bumps [actions/stale](https://github.com/actions/stale) from 9.1.0 to 10.0.0.
- [Release notes](https://github.com/actions/stale/releases)
- [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md)
- [Commits](5bef64f19d...3a9db7e6a4)

---
updated-dependencies:
- dependency-name: actions/stale
  dependency-version: 10.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-08 10:39:56 +02:00
dependabot[bot]
7ab763abba chore(deps): bump codecov/codecov-action from 5.5.0 to 5.5.1
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.5.0 to 5.5.1.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](fdcc847654...5a1091511a)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-version: 5.5.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-08 10:39:41 +02:00
dependabot[bot]
a05db18ba3 chore(deps): bump actions/setup-node from 4.4.0 to 5.0.0
Bumps [actions/setup-node](https://github.com/actions/setup-node) from 4.4.0 to 5.0.0.
- [Release notes](https://github.com/actions/setup-node/releases)
- [Commits](49933ea528...a0853c2454)

---
updated-dependencies:
- dependency-name: actions/setup-node
  dependency-version: 5.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-08 10:39:27 +02:00
Guillermo Oyarzun
a3168eb1b5 feat(gpu): enable lut generation with preallocated buffers 2025-09-08 10:01:34 +02:00
Arthur Meyre
7fccb851d7 fix(csprng): harmonize behavior for UnixSeeder between small and big endian
- bytes are generated in a given order, and an endianness needs to be given
to the buffer for the generated number to make sense
- Seed(pub u128) exposes that endianness, so it needs to be consistent for
outside users
2025-09-08 09:40:00 +02:00
Arthur Meyre
a78d5cc57b fix(csprng): make Seed interface less confusing wrt endianness
- From a user's perspective, giving the same u128 seed (e.g. 1u128) should have
the same behavior no matter the endianness of the system
2025-09-08 09:40:00 +02:00
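A minimal sketch of the consistency requirement from these two commits, assuming an arbitrary fixed byte order; this is not the tfhe-csprng code.

```rust
// Interpret seed bytes with one fixed byte order so that Seed(1u128)
// behaves identically on little- and big-endian hosts.
fn seed_from_bytes(bytes: [u8; 16]) -> u128 {
    // Fixed byte order, independent of the host's native endianness.
    u128::from_le_bytes(bytes)
}

fn main() {
    let mut bytes = [0u8; 16];
    bytes[0] = 1;
    // Same value on every platform.
    assert_eq!(seed_from_bytes(bytes), 1u128);
}
```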
Nicolas Sarlin
9c0d078e1a chore(zk): bump tfhe-zk-pok to 0.7.2 2025-09-08 09:30:34 +02:00
Nicolas Sarlin
6016755f9d fix(js): bump wasm bindgen version 2025-09-05 17:55:33 +02:00
256 changed files with 16801 additions and 8991 deletions

.cargo/audit.toml (new file)

@@ -0,0 +1,12 @@
[advisories]
ignore = [
# Ignoring unmaintained 'paste' advisory as it is a widely used, low-risk build dependency.
"RUSTSEC-2024-0436",
]
[output]
# Deny advisories that are warnings by default.
# At the moment this works if we allow paste, we might want to disable this in the future if it
# becomes too tedious
deny = ["warnings"]
quiet = false

@@ -1,5 +1,5 @@
# Add labels in pull request
name: PR label manager
name: approve_label
on:
pull_request:
@@ -11,6 +11,7 @@ permissions: {}
jobs:
trigger-tests:
name: approve_label/trigger-tests
runs-on: ubuntu-latest
permissions:
pull-requests: write

@@ -1,5 +1,5 @@
# Run backward compatibility tests
name: Backward compatibility Tests on CPU
name: aws_tfhe_backward_compat_tests
env:
CARGO_TERM_COLOR: always
@@ -22,13 +22,16 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
push:
branches:
- main
permissions:
contents: read
jobs:
setup-instance:
name: Setup instance (backward-compat-tests)
name: aws_tfhe_backward_compat_tests/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -53,11 +56,11 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
backward-compat-tests:
name: Backward compatibility tests
name: aws_tfhe_backward_compat_tests/backward-compat-tests (bpr)
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: true
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
@@ -123,7 +126,7 @@ jobs:
SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (backward-compat-tests)
name: aws_tfhe_backward_compat_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, backward-compat-tests ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run a small subset of tests to ensure quick feedback.
name: Fast AWS Tests on CPU
name: aws_tfhe_fast_tests
env:
CARGO_TERM_COLOR: always
@@ -29,6 +29,7 @@ permissions:
jobs:
should-run:
name: aws_tfhe_fast_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -68,7 +69,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
dependencies:
@@ -132,7 +133,7 @@ jobs:
echo "any_changed=true" >> "$GITHUB_OUTPUT"
setup-instance:
name: Setup instance (fast-tests)
name: aws_tfhe_fast_tests/setup-instance
if: github.event_name == 'workflow_dispatch' ||
(github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
needs: should-run
@@ -288,7 +289,7 @@ jobs:
SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (fast-tests)
name: aws_tfhe_fast_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, fast-tests ]
runs-on: ubuntu-latest

@@ -1,4 +1,4 @@
name: AWS Unsigned Integer Tests on CPU
name: aws_tfhe_integer_tests
env:
CARGO_TERM_COLOR: always
@@ -35,6 +35,7 @@ permissions:
jobs:
should-run:
name: aws_tfhe_integer_tests/should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
@@ -55,7 +56,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
integer:
@@ -69,7 +70,7 @@ jobs:
- .github/workflows/aws_tfhe_integer_tests.yml
setup-instance:
name: Setup instance (unsigned-integer-tests)
name: aws_tfhe_integer_tests/setup-instance
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
@@ -100,7 +101,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
unsigned-integer-tests:
name: Unsigned integer tests
name: aws_tfhe_integer_tests/unsigned-integer-tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -156,7 +157,7 @@ jobs:
SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (unsigned-integer-tests)
name: aws_tfhe_integer_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, unsigned-integer-tests]
runs-on: ubuntu-latest

@@ -1,4 +1,4 @@
name: Run noise checks on CPU
name: aws_tfhe_noise_checks
env:
CARGO_TERM_COLOR: always
@@ -25,7 +25,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (noise-checks)
name: aws_tfhe_noise_checks/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -52,7 +52,7 @@ jobs:
exit 1
noise-checks:
name: CPU noise checks
name: aws_tfhe_noise_checks/noise-checks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440
@@ -90,7 +90,7 @@ jobs:
SLACK_MESSAGE: "Noise checks tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (noise-checks)
name: aws_tfhe_noise_checks/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, noise-checks ]
runs-on: ubuntu-latest

@@ -1,4 +1,4 @@
name: AWS Signed Integer Tests on CPU
name: aws_tfhe_signed_integer_tests
env:
CARGO_TERM_COLOR: always
@@ -35,6 +35,7 @@ permissions:
jobs:
should-run:
name: aws_tfhe_signed_integer_tests/should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
@@ -56,7 +57,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
integer:
@@ -70,7 +71,7 @@ jobs:
- .github/workflows/aws_tfhe_signed_integer_tests.yml
setup-instance:
name: Setup instance (unsigned-integer-tests)
name: aws_tfhe_signed_integer_tests/setup-instance
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
@@ -101,7 +102,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
signed-integer-tests:
name: Signed integer tests
name: aws_tfhe_signed_integer_tests/signed-integer-tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -161,7 +162,7 @@ jobs:
SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (signed-integer-tests)
name: aws_tfhe_signed_integer_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, signed-integer-tests]
runs-on: ubuntu-latest

@@ -1,4 +1,4 @@
name: AWS Tests on CPU
name: aws_tfhe_tests
env:
CARGO_TERM_COLOR: always
@@ -32,6 +32,7 @@ permissions:
jobs:
should-run:
name: aws_tfhe_tests/should-run
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -77,7 +78,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
dependencies:
@@ -141,7 +142,7 @@ jobs:
echo "any_changed=true" >> "$GITHUB_OUTPUT"
setup-instance:
name: Setup instance (cpu-tests)
name: aws_tfhe_tests/setup-instance
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
needs: should-run
@@ -169,7 +170,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cpu-tests:
name: CPU tests
name: aws_tfhe_tests/cpu-tests
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
needs: [ should-run, setup-instance ]
@@ -268,7 +269,7 @@ jobs:
SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cpu-tests)
name: aws_tfhe_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cpu-tests ]
runs-on: ubuntu-latest

@@ -1,4 +1,4 @@
name: AWS WASM Tests on CPU
name: aws_tfhe_wasm_tests
env:
CARGO_TERM_COLOR: always
@@ -28,7 +28,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (wasm-tests)
name: aws_tfhe_wasm_tests/setup-instance
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -54,7 +54,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
wasm-tests:
name: WASM tests
name: aws_tfhe_wasm_tests/wasm-tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
@@ -137,7 +137,7 @@ jobs:
SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (wasm-tests)
name: aws_tfhe_wasm_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, wasm-tests ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run boolean benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Boolean benchmarks
name: benchmark_boolean
on:
workflow_dispatch:
@@ -23,7 +23,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (boolean-benchmarks)
name: benchmark_boolean/setup-instance
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -42,7 +42,7 @@ jobs:
profile: bench
boolean-benchmarks:
name: Execute boolean benchmarks in EC2
name: benchmark_boolean/boolean-benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -132,7 +132,7 @@ jobs:
SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (boolean-benchmarks)
name: benchmark_boolean/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, boolean-benchmarks ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Core crypto benchmarks
name: benchmark_core_crypto
on:
workflow_dispatch:
@@ -23,7 +23,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (core-crypto-benchmarks)
name: benchmark_core_crypto/setup-instance
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -42,7 +42,7 @@ jobs:
profile: bench
core-crypto-benchmarks:
name: Execute core crypto benchmarks in EC2
name: benchmark_core_crypto/core-crypto-benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -125,7 +125,7 @@ jobs:
SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (core-crypto-benchmarks)
name: benchmark_core_crypto/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, core-crypto-benchmarks ]
runs-on: ubuntu-latest

@@ -0,0 +1,152 @@
# Run sizes benchmarks on an instance and return parsed results to Slab CI bot.
name: Ciphertext and Keys sizes benchmarks
on:
workflow_dispatch:
schedule:
# Monthly benchmarks will be triggered each 24th of the month at 1a.m.
- cron: '0 1 24 * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
jobs:
setup-instance:
name: Setup instance (sizes-benchmarks)
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-big
sizes-benchmarks:
name: Execute sizes client benchmarks
needs: setup-instance
if: needs.setup-instance.result != 'skipped'
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Measure public key and ciphertext sizes in HL Api
run: |
make measure_hlapi_compact_pk_ct_sizes
- name: Parse key and ciphertext sizes results
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/hlapi_ct_key_sizes.csv "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "m6i.32xlarge" \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--object-sizes
env:
REF_NAME: ${{ github.ref_name }}
- name: Measure key sizes in shortint
run: |
make measure_shortint_key_sizes
- name: Parse key sizes results
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_ct_key_sizes
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Sizes benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (sizes-benchmarks)
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, sizes-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (sizes-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

@@ -1,5 +1,5 @@
# Run all DEX benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: DEX benchmarks
name: benchmark_dex
on:
workflow_dispatch:
@@ -22,7 +22,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (dex-benchmarks)
name: benchmark_dex/setup-instance
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -41,7 +41,7 @@ jobs:
profile: bench
dex-benchmarks:
name: Execute DEX benchmarks
name: benchmark_dex/dex-benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -146,7 +146,7 @@ jobs:
SLACK_MESSAGE: "DEX benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (dex-benchmarks)
name: benchmark_dex/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, dex-benchmarks ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run all ERC20 benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: ERC20 benchmarks
name: benchmark_erc20
on:
workflow_dispatch:
@@ -23,7 +23,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (erc20-benchmarks)
name: benchmark_erc20/setup-instance
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -42,7 +42,7 @@ jobs:
profile: bench
erc20-benchmarks:
name: Execute ERC20 benchmarks
name: benchmark_erc20/erc20-benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -129,7 +129,7 @@ jobs:
SLACK_MESSAGE: "ERC20 benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (erc20-benchmarks)
name: benchmark_erc20/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, erc20-benchmarks ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run CUDA benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
name: Cuda benchmarks
name: benchmark_gpu
on:
workflow_dispatch:
@@ -66,6 +66,7 @@ permissions: {}
jobs:
parse-inputs:
name: benchmark_gpu/parse-inputs
runs-on: ubuntu-latest
outputs:
profile: ${{ steps.parse_profile.outputs.profile }}
@@ -90,7 +91,7 @@ jobs:
echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
run-benchmarks:
name: Run benchmarks
name: benchmark_gpu/run-benchmarks
needs: parse-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:

@@ -1,5 +1,5 @@
# Run benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 benchmarks
name: benchmark_gpu_4090
env:
CARGO_TERM_COLOR: always
@@ -27,7 +27,7 @@ permissions:
jobs:
cuda-integer-benchmarks:
name: Cuda integer benchmarks (RTX 4090)
name: benchmark_gpu_4090/cuda-integer-benchmarks
if: ${{ github.event_name == 'workflow_dispatch' ||
github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs' ||
contains(github.event.label.name, '4090_bench') }}
@@ -111,7 +111,7 @@ jobs:
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-core-crypto-benchmarks:
name: Cuda core crypto benchmarks (RTX 4090)
name: benchmark_gpu_4090/cuda-core-crypto-benchmarks
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
needs: cuda-integer-benchmarks
concurrency:
@@ -195,7 +195,7 @@ jobs:
SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
remove_github_label:
name: Remove 4090 bench label
name: benchmark_gpu_4090/remove_github_label
if: ${{ always() && github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run benchmarks on CUDA instance and return parsed results to Slab CI bot.
name: Cuda benchmarks - common
name: benchmark_gpu_common
on:
workflow_call:
@@ -63,7 +63,7 @@ permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
name: benchmark_gpu_common/prepare-matrix
runs-on: ubuntu-latest
outputs:
command: ${{ steps.set_command.outputs.command }}
@@ -141,7 +141,7 @@ jobs:
echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
name: benchmark_gpu_common/setup-instance
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
@@ -185,7 +185,7 @@ jobs:
# Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
install-dependencies:
name: Install dependencies
name: benchmark_gpu_common/install-dependencies
needs: [ setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
@@ -210,7 +210,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
cuda-benchmarks:
name: Cuda benchmarks (${{ inputs.profile }})
name: benchmark_gpu_common/cuda-benchmarks
needs: [ prepare-matrix, setup-instance, install-dependencies ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
@@ -329,7 +329,7 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
name: benchmark_gpu_common/slack-notify
needs: [ setup-instance, cuda-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-benchmarks.result != 'skipped' && failure() }}
@@ -342,7 +342,7 @@ jobs:
SLACK_MESSAGE: "Cuda benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
name: benchmark_gpu_common/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run CUDA DEX benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
name: Cuda DEX benchmarks
name: benchmark_gpu_dex/
on:
workflow_dispatch:
@@ -23,6 +23,7 @@ permissions: {}
jobs:
parse-inputs:
name: benchmark_gpu_dex/parse-inputs
runs-on: ubuntu-latest
outputs:
profile: ${{ steps.parse_profile.outputs.profile }}
@@ -47,7 +48,7 @@ jobs:
echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
run-benchmarks:
name: Run benchmarks
name: benchmark_gpu_dex/run-benchmarks
needs: parse-inputs
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:

@@ -1,5 +1,5 @@
# Run DEX benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Cuda DEX benchmarks - common
name: benchmark_gpu_dex_common
on:
workflow_call:
@@ -47,7 +47,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (cuda-dex-benchmarks)
name: benchmark_gpu_dex_common/setup-instance
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -91,7 +91,7 @@ jobs:
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-dex-benchmarks:
name: Cuda DEX benchmarks (${{ inputs.profile }})
name: benchmark_gpu_dex_common/cuda-dex-benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
@@ -177,7 +177,7 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
name: benchmark_gpu_dex_common/slack-notify
needs: [ setup-instance, cuda-dex-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-dex-benchmarks.result != 'skipped' && failure() }}
@@ -190,7 +190,7 @@ jobs:
SLACK_MESSAGE: "Cuda DEX benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-dex-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-dex-${{ inputs.profile }}-benchmarks)
name: benchmark_gpu_dex_common/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-dex-benchmarks, slack-notify ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run CUDA DEX benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
name: Cuda DEX weekly benchmarks
name: benchmark_gpu_dex_weekly
on:
schedule:
@@ -10,7 +10,7 @@ permissions: {}
jobs:
run-benchmarks-1-h100:
name: Run benchmarks (1xH100)
name: benchmark_gpu_dex_weekly/run-benchmarks-1-h100
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:
@@ -27,7 +27,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-2-h100:
name: Run benchmarks (2xH100)
name: benchmark_gpu_dex_weekly/run-benchmarks-2-h100
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:
@@ -44,7 +44,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100:
name: Run benchmarks (8xH100)
name: benchmark_gpu_dex_weekly/run-benchmarks-8-h100
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:

@@ -1,5 +1,5 @@
# Run CUDA ERC20 benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
name: Cuda ERC20 benchmarks
name: benchmark_gpu_erc20
on:
workflow_dispatch:
@@ -24,6 +24,7 @@ permissions: {}
jobs:
parse-inputs:
name: benchmark_gpu_erc20/parse-inputs
runs-on: ubuntu-latest
outputs:
profile: ${{ steps.parse_profile.outputs.profile }}
@@ -48,7 +49,7 @@ jobs:
echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
run-benchmarks:
name: Run benchmarks
name: benchmark_gpu_erc20/run-benchmarks
needs: parse-inputs
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
with:

@@ -1,5 +1,5 @@
# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Cuda ERC20 benchmarks - common
name: benchmark_gpu_erc20_common
on:
workflow_call:
@@ -48,7 +48,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (cuda-erc20-benchmarks)
name: benchmark_gpu_erc20_common/setup-instance
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -92,7 +92,7 @@ jobs:
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-erc20-benchmarks:
name: Cuda ERC20 benchmarks (${{ inputs.profile }})
name: benchmark_gpu_erc20_common/cuda-erc20-benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
@@ -178,7 +178,7 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
name: benchmark_gpu_erc20_common/slack-notify
needs: [ setup-instance, cuda-erc20-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
@@ -191,7 +191,7 @@ jobs:
SLACK_MESSAGE: "Cuda ERC20 benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
name: benchmark_gpu_erc20_common/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
runs-on: ubuntu-latest

@@ -1,5 +1,5 @@
# Run CUDA ERC20 benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
name: Cuda ERC20 weekly benchmarks
name: benchmark_gpu_erc20_weekly
on:
schedule:
@@ -11,7 +11,7 @@ permissions: {}
jobs:
run-benchmarks-1-h100:
name: Run benchmarks (1xH100)
name: benchmark_gpu_erc20_weekly/run-benchmarks-1-h100
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
with:
@@ -28,7 +28,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-2-h100:
name: Run benchmarks (2xH100)
name: benchmark_gpu_erc20_weekly/run-benchmarks-2-h100
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
with:
@@ -45,7 +45,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100:
name: Run benchmarks (8xH100)
name: benchmark_gpu_erc20_weekly/run-benchmarks-8-h100
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
with:

@@ -1,5 +1,5 @@
# Run CUDA benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
name: Cuda weekly benchmarks
name: benchmark_gpu_weekly
on:
schedule:
@@ -11,7 +11,7 @@ permissions: {}
jobs:
run-benchmarks-8-h100-sxm5-integer:
name: Run integer benchmarks (8xH100-SXM5)
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -32,7 +32,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-integer-compression:
name: Run integer compression benchmarks (8xH100-SXM5)
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -53,7 +53,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-integer-zk:
name: Run integer zk benchmarks (8xH100-SXM5)
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -74,7 +74,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-noise-squash:
name: Run integer zk benchmarks (8xH100-SXM5)
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -95,7 +95,7 @@ jobs:
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-1-h100-core-crypto:
name: Run core-crypto benchmarks (1xH100)
name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:

@@ -1,9 +1,12 @@
# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: Hpu Integer Benchmarks
name: benchmark_hpu_integer
on:
workflow_dispatch:
inputs:
all_precisions:
description: "Run all precisions"
type: boolean
bench_type:
description: "Benchmarks type"
type: choice
@@ -19,6 +22,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
FAST_BENCH: TRUE
permissions: {}
@@ -52,7 +56,7 @@ jobs:
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
name: benchmark_hpu_integer/integer-benchmarks-hpu
needs: prepare-matrix
runs-on: v80-desktop
concurrency:
@@ -102,6 +106,11 @@ jobs:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Should run benchmarks with all precisions
if: inputs.all_precisions
run: |
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
- name: Run benchmarks
run: |
make pull_hpu_files

@@ -1,5 +1,5 @@
# Run all integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Integer benchmarks
name: benchmark_integer
on:
workflow_dispatch:
@@ -41,7 +41,7 @@ permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
name: benchmark_integer/prepare-matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -87,7 +87,7 @@ jobs:
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (integer-benchmarks)
name: benchmark_integer/setup-instance
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
@@ -105,7 +105,7 @@ jobs:
profile: bench
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
name: benchmark_integer/integer-benchmarks
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -211,7 +211,7 @@ jobs:
SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (integer-benchmarks)
name: benchmark_integer/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, integer-benchmarks ]
runs-on: ubuntu-latest

@@ -0,0 +1,298 @@
# Run performance regression benchmarks and return parsed results to associated pull-request.
name: benchmark_perf_regression
on:
issue_comment:
types: [ created ]
pull_request:
types: [ labeled ]
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: { }
jobs:
verify-actor:
name: benchmark_perf_regression/verify-actor
if: (github.event_name == 'pull_request' &&
(contains(github.event.label.name, 'bench-perfs-cpu') ||
contains(github.event.label.name, 'bench-perfs-gpu'))) ||
(github.event.issue.pull_request && startsWith(github.event.comment.body, '/bench'))
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
prepare-benchmarks:
name: benchmark_perf_regression/prepare-benchmarks
needs: verify-actor
runs-on: ubuntu-latest
outputs:
commands: ${{ steps.set_commands.outputs.commands }}
slab-backend: ${{ steps.set_slab_details.outputs.backend }}
slab-profile: ${{ steps.set_slab_details.outputs.profile }}
hardware-name: ${{ steps.get_hardware_name.outputs.name }}
custom-env: ${{ steps.get_custom_env.outputs.custom_env }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Generate cpu benchmarks command from label
if: (github.event_name == 'pull_request' && contains(github.event.label.name, 'bench-perfs-cpu'))
run: |
echo "DEFAULT_BENCH_OPTIONS=--backend cpu" >> "${GITHUB_ENV}"
- name: Generate cpu benchmarks command from label
if: (github.event_name == 'pull_request' && contains(github.event.label.name, 'bench-perfs-gpu'))
run: |
echo "DEFAULT_BENCH_OPTIONS=--backend gpu" >> "${GITHUB_ENV}"
# TODO add support for HPU backend
- name: Generate cargo commands and env from label
if: github.event_name == 'pull_request'
run: |
python3 ci/perf_regression.py parse_profile --issue-comment "/bench ${DEFAULT_BENCH_OPTIONS}"
echo "COMMANDS=$(cat ci/perf_regression_generated_commands.json)" >> "${GITHUB_ENV}"
- name: Dump issue comment into file # To avoid possible code-injection
if: github.event_name == 'issue_comment'
run: |
echo "${COMMENT_BODY}" >> dumped_comment.txt
env:
COMMENT_BODY: ${{ github.event.comment.body }}
- name: Generate cargo commands and env
if: github.event_name == 'issue_comment'
run: |
python3 ci/perf_regression.py parse_profile --issue-comment "$(cat dumped_comment.txt)"
echo "COMMANDS=$(cat ci/perf_regression_generated_commands.json)" >> "${GITHUB_ENV}"
- name: Set commands output
id: set_commands
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "commands=${{ toJSON(env.COMMANDS) }}" >> "${GITHUB_OUTPUT}"
- name: Set Slab details outputs
id: set_slab_details
run: |
echo "backend=$(cat ci/perf_regression_slab_backend_config.txt)" >> "${GITHUB_OUTPUT}"
echo "profile=$(cat ci/perf_regression_slab_profile_config.txt)" >> "${GITHUB_OUTPUT}"
- name: Get hardware name
id: get_hardware_name
run: | # zizmor: ignore[template-injection] these interpolations are safe
HARDWARE_NAME=$(python3 ci/hardware_finder.py "${{ steps.set_slab_details.outputs.backend }}" "${{ steps.set_slab_details.outputs.profile }}");
echo "name=${HARDWARE_NAME}" >> "${GITHUB_OUTPUT}"
- name: Get custom env vars
id: get_custom_env
run: |
echo "custom_env=$(cat ci/perf_regression_custom_env.sh)" >> "${GITHUB_OUTPUT}"
setup-instance:
name: benchmark_perf_regression/setup-instance
needs: prepare-benchmarks
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: ${{ needs.prepare-benchmarks.outputs.slab-backend }}
profile: ${{ needs.prepare-benchmarks.outputs.slab-profile }}
install-cuda-dependencies-if-required:
name: benchmark_perf_regression/install-cuda-dependencies-if-required
needs: [ prepare-benchmarks, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
matrix:
# explicit include-based build matrix, of known valid options
include:
- cuda: "12.8"
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.prepare-benchmarks.outputs.slab-backend == 'hyperstack'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
regression-benchmarks:
name: benchmark_perf_regression/regression-benchmarks
needs: [ prepare-benchmarks, setup-instance, install-cuda-dependencies-if-required ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow_ref }}_${{ needs.prepare-benchmarks.outputs.slab-backend }}_${{ needs.prepare-benchmarks.outputs.slab-profile }}
cancel-in-progress: true
timeout-minutes: 720 # 12 hours
strategy:
max-parallel: 1
matrix:
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Export custom env variables
run: | # zizmor: ignore[template-injection] this env variable is safe
{
${{ needs.prepare-benchmarks.outputs.custom-env }}
} >> "$GITHUB_ENV"
# Re-export environment variables as dependencies setup perform this task in the previous job.
# Local env variables are cleaned at the end of each job.
- name: Export CUDA variables
if: needs.prepare-benchmarks.outputs.slab-backend == 'hyperstack'
shell: bash
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDA_MODULE_LOADER=EAGER" >> "${GITHUB_ENV}"
env:
CUDA_PATH: /usr/local/cuda-12.8
- name: Export gcc and g++ variables
if: needs.prepare-benchmarks.outputs.slab-backend == 'hyperstack'
shell: bash
run: |
{
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: 11
- name: Install rust
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run regression benchmarks
run: |
make BENCH_CUSTOM_COMMAND="${BENCH_COMMAND}" bench_custom
env:
BENCH_COMMAND: ${{ matrix.command }}
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "${HARDWARE_NAME}" \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
--name-suffix regression \
--bench-type "${BENCH_TYPE}"
env:
REF_NAME: ${{ github.ref_name }}
BENCH_TYPE: ${{ env.__TFHE_RS_BENCH_TYPE }}
HARDWARE_NAME: ${{ needs.prepare-benchmarks.outputs.hardware-name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_regression
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: benchmark_perf_regression/slack-notify
needs: [ prepare-benchmarks, setup-instance, regression-benchmarks ]
runs-on: ubuntu-latest
if: ${{ failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.regression-benchmarks.result }}
SLACK_MESSAGE: "Performance regression benchmarks finished with status: ${{ needs.regression-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
# TODO Add job for regression calculation
teardown-instance:
name: benchmark_perf_regression/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, regression-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (regression-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"


@@ -1,5 +1,5 @@
# Run all shortint benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Shortint full benchmarks
name: benchmark_shortint
on:
workflow_dispatch:
@@ -27,7 +27,7 @@ permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
name: benchmark_shortint/prepare-matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -51,7 +51,7 @@ jobs:
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (shortint-benchmarks)
name: benchmark_shortint/setup-instance
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
@@ -69,7 +69,7 @@ jobs:
profile: bench
shortint-benchmarks:
name: Execute shortint benchmarks for all operations flavor
name: benchmark_shortint/shortint-benchmarks
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -131,19 +131,6 @@ jobs:
env:
REF_NAME: ${{ github.ref_name }}
# This small benchmark needs to be executed only once.
- name: Measure key sizes
if: matrix.op_flavor == 'default'
run: |
make measure_shortint_key_sizes
- name: Parse key sizes results
if: matrix.op_flavor == 'default'
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
@@ -168,7 +155,7 @@ jobs:
SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (shortint-benchmarks)
name: benchmark_shortint/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, shortint-benchmarks ]
runs-on: ubuntu-latest
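This and the following files apply one consistent renaming convention: the workflow-level name becomes a snake_case identifier, and each job name is prefixed with it as <workflow>/<job>, so check names group predictably in the UI; the "(bpr)" suffix presumably marks jobs referenced by branch-protection rules. A minimal sketch of the resulting shape (illustrative YAML, not a file from this diff):
name: benchmark_shortint
jobs:
  setup-instance:
    name: benchmark_shortint/setup-instance
  shortint-benchmarks:
    name: benchmark_shortint/shortint-benchmarks
    needs: setup-instance
  teardown-instance:
    name: benchmark_shortint/teardown-instance
    needs: [ setup-instance, shortint-benchmarks ]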


@@ -1,5 +1,5 @@
# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Signed Integer full benchmarks
name: benchmark_signed_integer
on:
workflow_dispatch:
@@ -41,7 +41,7 @@ permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
name: benchmark_signed_integer/prepare-matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -87,7 +87,7 @@ jobs:
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (signed-integer-benchmarks)
name: benchmark_signed_integer/setup-instance
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
@@ -105,7 +105,7 @@ jobs:
profile: bench
signed-integer-benchmarks:
name: Execute signed integer benchmarks for all operations flavor
name: benchmark_signed_integer/signed-integer-benchmarks
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
@@ -203,7 +203,7 @@ jobs:
SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (integer-benchmarks)
name: benchmark_signed_integer/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, signed-integer-benchmarks ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Run FFT benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: FFT benchmarks
name: benchmark_tfhe_fft
env:
CARGO_TERM_COLOR: always
@@ -27,8 +27,8 @@ on:
permissions: {}
jobs:
setup-ec2:
name: Setup EC2 instance (fft-benchmarks)
setup-instance:
name: benchmark_tfhe_fft/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -45,12 +45,12 @@ jobs:
profile: bench
fft-benchmarks:
name: Execute FFT benchmarks in EC2
needs: setup-ec2
name: benchmark_tfhe_fft/fft-benchmarks
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
@@ -124,10 +124,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-fft benchmarks failed. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (fft-benchmarks)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, fft-benchmarks ]
teardown-instance:
name: benchmark_tfhe_fft/teardown-instance
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, fft-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -138,7 +138,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -146,4 +146,4 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (fft-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Instance teardown (fft-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"


@@ -1,5 +1,5 @@
# Run NTT benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: NTT benchmarks
name: benchmark_tfhe_ntt
env:
CARGO_TERM_COLOR: always
@@ -27,8 +27,8 @@ on:
permissions: {}
jobs:
setup-ec2:
name: Setup EC2 instance (ntt-benchmarks)
setup-instance:
name: benchmark_tfhe_ntt/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -45,12 +45,12 @@ jobs:
profile: bench
ntt-benchmarks:
name: Execute NTT benchmarks in EC2
needs: setup-ec2
name: benchmark_tfhe_ntt/ntt-benchmarks
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
@@ -124,10 +124,10 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-ntt benchmarks failed. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (ntt-benchmarks)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [setup-ec2, ntt-benchmarks]
teardown-instance:
name: benchmark_tfhe_ntt/teardown-instance
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [setup-instance, ntt-benchmarks]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -138,7 +138,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
@@ -146,4 +146,4 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"


@@ -1,5 +1,5 @@
# Run benchmarks of the tfhe-zk-pok crate on an instance and return parsed results to Slab CI bot.
name: tfhe-zk-pok benchmarks
name: benchmark_tfhe_zk_pok
on:
workflow_dispatch:
@@ -35,6 +35,7 @@ permissions: {}
jobs:
should-run:
name: benchmark_tfhe_zk_pok/should-run
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
@@ -50,7 +51,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
zk_pok:
@@ -58,7 +59,7 @@ jobs:
- .github/workflows/benchmark_tfhe_zk_pok.yml
setup-instance:
name: Setup instance (tfhe-zk-pok-benchmarks)
name: benchmark_tfhe_zk_pok/setup-instance
runs-on: ubuntu-latest
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
@@ -81,7 +82,7 @@ jobs:
profile: bench
tfhe-zk-pok-benchmarks:
name: Execute tfhe-zk-pok benchmarks
name: benchmark_tfhe_zk_pok/tfhe-zk-pok-benchmarks
if: needs.setup-instance.result != 'skipped'
needs: setup-instance
concurrency:
@@ -173,7 +174,7 @@ jobs:
SLACK_MESSAGE: "tfhe-zk-pok benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (tfhe-zk-pok-benchmarks)
name: benchmark_tfhe_zk_pok/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, tfhe-zk-pok-benchmarks ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
name: WASM client benchmarks
name: benchmark_wasm_client
on:
workflow_dispatch:
@@ -26,6 +26,7 @@ permissions: {}
jobs:
should-run:
name: benchmark_wasm_client/should-run
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
@@ -44,7 +45,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
wasm_bench:
@@ -57,7 +58,7 @@ jobs:
- .github/workflows/wasm_client_benchmark.yml
setup-instance:
name: Setup instance (wasm-client-benchmarks)
name: benchmark_wasm_client/setup-instance
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
@@ -78,7 +79,7 @@ jobs:
profile: cpu-small
wasm-client-benchmarks:
name: Execute WASM client benchmarks
name: benchmark_wasm_client/wasm-client-benchmarks
needs: setup-instance
if: needs.setup-instance.result != 'skipped'
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
@@ -165,19 +166,6 @@ jobs:
env:
REF_NAME: ${{ github.ref_name }}
# Run these benchmarks only once
- name: Measure public key and ciphertext sizes in HL Api
if: matrix.browser == 'chrome'
run: |
make measure_hlapi_compact_pk_ct_sizes
- name: Parse key and ciphertext sizes results
if: matrix.browser == 'chrome'
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/hlapi_cpk_and_cctl_sizes.csv "${RESULTS_FILENAME}" \
--key-gen \
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
@@ -210,7 +198,7 @@ jobs:
SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (wasm-client-benchmarks)
name: benchmark_wasm_client/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, wasm-client-benchmarks ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Run PKE Zero-Knowledge benchmarks on an instance and return parsed results to Slab CI bot.
name: PKE ZK benchmarks
name: benchmark_zk_pke
on:
workflow_dispatch:
@@ -36,6 +36,7 @@ permissions: {}
jobs:
should-run:
name: benchmark_zk_pke/should-run
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
@@ -51,7 +52,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
zk_pok:
@@ -67,7 +68,7 @@ jobs:
- .github/workflows/zk_pke_benchmark.yml
prepare-matrix:
name: Prepare operations matrix
name: benchmark_zk_pke/prepare-matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -96,7 +97,7 @@ jobs:
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (pke-zk-benchmarks)
name: benchmark_zk_pke/setup-instance
runs-on: ubuntu-latest
needs: [ should-run, prepare-matrix ]
if: github.event_name == 'workflow_dispatch' ||
@@ -119,7 +120,7 @@ jobs:
profile: bench
pke-zk-benchmarks:
name: Execute PKE ZK benchmarks
name: benchmark_zk_pke/pke-zk-benchmarks
if: needs.setup-instance.result != 'skipped'
needs: [ prepare-matrix, setup-instance ]
concurrency:
@@ -223,7 +224,7 @@ jobs:
SLACK_MESSAGE: "PKE ZK benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (pke-zk-benchmarks)
name: benchmark_zk_pke/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, pke-zk-benchmarks ]
runs-on: ubuntu-latest

.github/workflows/cargo_audit.yml (new file, +40 lines)

@@ -0,0 +1,40 @@
# Run cargo audit
on:
workflow_dispatch:
schedule:
# runs every day at 4am UTC
- cron: '0 4 * * *'
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACKIFY_MARKDOWN: true
permissions: {}
jobs:
audit:
name: cargo_audit/audit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Audit dependencies
run: |
make audit_dependencies
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "cargo-audit finished with status: ${{ job.status }}. ([action run](${{ env.ACTION_RUN_URL }}))"


@@ -1,4 +1,4 @@
name: Cargo Build TFHE-rs
name: cargo_build
on:
pull_request:
@@ -19,6 +19,7 @@ permissions:
jobs:
cargo-builds:
name: cargo_build/cargo-builds (bpr)
runs-on: ${{ matrix.os }}
strategy:


@@ -1,5 +1,5 @@
# Build tfhe-fft
name: Cargo Build tfhe-fft
name: cargo_build_tfhe_fft
on:
pull_request:
@@ -17,6 +17,7 @@ permissions:
jobs:
cargo-builds-fft:
name: cargo_build_tfhe_fft/cargo-builds-fft (bpr)
runs-on: ${{ matrix.runner_type }}
strategy:


@@ -1,5 +1,5 @@
# Build tfhe-ntt
name: Cargo Build tfhe-ntt
name: cargo_build_tfhe_ntt
on:
pull_request:
@@ -17,6 +17,7 @@ permissions:
jobs:
cargo-builds-ntt:
name: cargo_build_tfhe_ntt/cargo-builds-ntt (bpr)
runs-on: ${{ matrix.os }}
strategy:
matrix:


@@ -1,5 +1,5 @@
# Test tfhe-fft
name: Cargo Test tfhe-fft
name: cargo_test_fft
on:
pull_request:
@@ -21,6 +21,7 @@ permissions:
jobs:
should-run:
name: cargo_test_fft/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -36,7 +37,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
fft:
@@ -46,6 +47,7 @@ jobs:
- '.github/workflows/cargo_test_fft.yml'
cargo-tests-fft:
name: cargo_test_fft/cargo-tests-fft
needs: should-run
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ${{ matrix.runner_type }}
@@ -77,6 +79,7 @@ jobs:
make test_fft_no_std
cargo-tests-fft-nightly:
name: cargo_test_fft/cargo-tests-fft-nightly
needs: should-run
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ${{ matrix.runner_type }}
@@ -104,6 +107,7 @@ jobs:
make test_fft_no_std_nightly
cargo-tests-fft-node-js:
name: cargo_test_fft/cargo-tests-fft-node-js
needs: should-run
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ubuntu-latest
@@ -119,6 +123,7 @@ jobs:
make test_fft_node_js_ci
cargo-tests-fft-successful:
name: cargo_test_fft/cargo-tests-fft-successful (bpr)
needs: [ should-run, cargo-tests-fft, cargo-tests-fft-nightly, cargo-tests-fft-node-js ]
if: ${{ always() }}
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Test tfhe-ntt
name: Cargo Test tfhe-ntt
name: cargo_test_ntt
on:
pull_request:
@@ -22,6 +22,7 @@ permissions:
jobs:
should-run:
name: cargo_test_ntt/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -37,7 +38,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
ntt:
@@ -47,6 +48,7 @@ jobs:
- '.github/workflows/cargo_test_ntt.yml'
setup-instance:
name: cargo_test_ntt/setup-instance
needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ubuntu-latest
@@ -75,6 +77,7 @@ jobs:
echo "matrix_os=[\"${INSTANCE_TO_USE}\", \"macos-latest\", \"windows-latest\"]" >> "$GITHUB_OUTPUT"
cargo-tests-ntt:
name: cargo_test_ntt/cargo-tests-ntt
needs: [should-run, setup-instance]
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
@@ -101,6 +104,7 @@ jobs:
run: make test_ntt_no_std
cargo-tests-ntt-nightly:
name: cargo_test_ntt/cargo-tests-ntt-nightly
needs: [should-run, setup-instance]
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
@@ -126,6 +130,7 @@ jobs:
run: make test_ntt_no_std_nightly
cargo-tests-ntt-successful:
name: cargo_test_ntt/cargo-tests-ntt-successful (bpr)
needs: [should-run, cargo-tests-ntt, cargo-tests-ntt-nightly]
if: ${{ always() }}
runs-on: ubuntu-latest
@@ -151,7 +156,7 @@ jobs:
exit 1
teardown-instance:
name: Teardown instance (cargo-tests-ntt-successful)
name: cargo_test_ntt/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, cargo-tests-ntt-successful]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Check commit and PR compliance
name: Check commit and PR compliance
name: check_commit
on:
pull_request:
@@ -7,7 +7,7 @@ permissions: {}
jobs:
check-commit-pr:
name: Check commit and PR
name: check_commit/check-commit-pr (bpr)
runs-on: ubuntu-latest
permissions:
contents: read


@@ -1,5 +1,5 @@
# Lint and check CI
name: CI Lint and Checks
name: ci_lint
on:
pull_request:
@@ -14,7 +14,7 @@ permissions:
jobs:
lint-check:
name: Lint and checks
name: ci_lint/lint-check (bpr)
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs


@@ -1,4 +1,4 @@
name: Code Coverage
name: code_coverage
env:
CARGO_TERM_COLOR: always
@@ -22,7 +22,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (code-coverage)
name: code_coverage/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -38,8 +38,8 @@ jobs:
backend: aws
profile: cpu-small
code-coverage:
name: Code coverage tests
code-coverage-tests:
name: code_coverage/code-coverage-tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}_${{ github.event_name }}
@@ -60,7 +60,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
tfhe:
@@ -90,7 +90,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@fdcc8476540edceab3de004e990f80d881c6cc00
uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -104,7 +104,7 @@ jobs:
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@fdcc8476540edceab3de004e990f80d881c6cc00
uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -121,9 +121,9 @@ jobs:
SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (code-coverage)
name: code_coverage/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, code-coverage ]
needs: [ setup-instance, code-coverage-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
@@ -142,4 +142,4 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Instance teardown (code-coverage-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"


@@ -1,8 +1,24 @@
# Run all fhevm coprocessor benchmarks on a GPU instance on Hyperstack and return parsed results to Slab CI bot.
name: Cuda Coprocessor benchmarks
name: coprocessor-benchmark-gpu
on:
workflow_dispatch:
inputs:
profile:
description: "Instance type"
required: true
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"
- "multi-h100 (n3-H100x8)"
- "multi-h100-nvlink (n3-H100x8-NVLink)"
- "multi-h100-sxm5 (n3-H100x8-SXM5)"
- "multi-h100-sxm5_fallback (n3-H100x8-SXM5)"
schedule:
# Weekly tests @ 1AM
- cron: "0 1 * * 6"
@@ -17,7 +33,9 @@ env:
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
PROFILE: "multi-h100-sxm5 (n3-H100x8-SXM5)"
PROFILE_SCHEDULED_RUN: "multi-h100-sxm5 (n3-H100x8-SXM5)"
PROFILE_MANUAL_RUN: ${{ inputs.profile }}
IS_MANUAL_RUN: ${{ github.event_name == 'workflow_dispatch' }}
BENCHMARK_TYPE: "ALL"
OPTIMIZATION_TARGET: "throughput"
BATCH_SIZE: "5000"
@@ -40,15 +58,25 @@ jobs:
- name: Parse profile
id: parse_profile
run: |
if [[ ${IS_MANUAL_RUN} == true ]]; then
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
else
PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
fi
# shellcheck disable=SC2001
PROFILE_VAL=$(echo "${PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
echo "profile=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
- name: Parse hardware name
id: parse_hardware_name
run: |
if [[ ${IS_MANUAL_RUN} == true ]]; then
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
else
PROFILE_RAW="${PROFILE}"
fi
# shellcheck disable=SC2001
PROFILE_VAL=$(echo "${PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
echo "name=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -71,7 +99,7 @@ jobs:
backend: hyperstack
profile: ${{ needs.parse-inputs.outputs.profile }}
benchmark:
benchmark-gpu:
name: coprocessor-benchmark-gpu/benchmark-gpu (bpr)
needs: [ parse-inputs, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
@@ -130,6 +158,13 @@ jobs:
} >> "${GITHUB_ENV}"
working-directory: tfhe-rs/
- name: Setup Hyperstack dependencies
uses: ./tfhe-rs/.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Check fhEVM and TFHE-rs repos
run: |
pwd
@@ -140,13 +175,6 @@ jobs:
run: git lfs checkout
working-directory: fhevm/
- name: Setup Hyperstack dependencies
uses: ./fhevm/.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install rust
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
@@ -154,7 +182,7 @@ jobs:
- name: Install cargo dependencies
run: |
sudo apt-get install -y protobuf-compiler cmake pkg-config libssl-dev \
sudo apt-get install -y protobuf-compiler pkg-config libssl-dev \
libclang-dev docker-compose-v2 docker.io acl
sudo usermod -aG docker "$USER"
newgrp docker
@@ -181,12 +209,19 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to Chainguard Registry
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: cgr.dev
username: ${{ secrets.CGR_USERNAME }}
password: ${{ secrets.CGR_PASSWORD }}
- name: Init database
run: make init_db
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Use Node.js
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: 20.x
@@ -203,8 +238,12 @@ jobs:
- name: Profile erc20 no-cmux benchmark on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "profile_erc20_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
BENCHMARK_TYPE="THROUGHPUT_200" \
OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
make -e "profile_erc20_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Get nsys profile name
id: nsys_profile_name
@@ -215,7 +254,7 @@ jobs:
REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
run: |
mv report1.nsys-rep ${{ env.REPORT_NAME }}
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Upload profile artifact
env:
@@ -223,17 +262,17 @@ jobs:
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ env.REPORT_NAME }}
path: fhevm/coprocessor/fhevm-engine/coprocessor/${{ env.REPORT_NAME }}
path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
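The uploaded .nsys-rep artifact can be inspected locally with NVIDIA Nsight Systems (illustrative commands, assuming nsys is installed):
nsys stats "${REPORT_NAME}"   # summary tables: kernel times, memcpy, OS runtime calls
or open the file in the nsys-ui GUI for the full timeline view.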
- name: Run latency benchmark on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Run throughput benchmarks on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="THROUGHPUT_200" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Parse results
run: |
@@ -246,7 +285,7 @@ jobs:
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
--crate "coprocessor/fhevm-engine/coprocessor" \
--crate "coprocessor/fhevm-engine/tfhe-worker" \
--name-suffix "operation_batch_size_${BATCH_SIZE}-schedule_${SCHEDULING_POLICY}-optimization_target_${OPTIMIZATION_TARGET}"
working-directory: fhevm/
@@ -273,9 +312,9 @@ jobs:
--slab-url "${SLAB_URL}"
teardown-instance:
name: coprocessor-benchmark-gpu/teardown
name: coprocessor-benchmark-gpu/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, benchmark ]
needs: [ setup-instance, benchmark-gpu ]
runs-on: ubuntu-latest
permissions:
contents: 'read'


@@ -1,4 +1,4 @@
name: CSPRNG randomness testing Workflow
name: csprng_randomness_tests
env:
CARGO_TERM_COLOR: always
@@ -26,7 +26,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (csprng-randomness-tests)
name: csprng_randomness_tests/setup-instance
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
@@ -52,7 +52,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
csprng-randomness-tests:
name: CSPRNG randomness tests
name: csprng_randomness_tests/csprng-randomness-tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
@@ -83,7 +83,7 @@ jobs:
SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (csprng-randomness-tests)
name: csprng_randomness_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, csprng-randomness-tests ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an RTX 4090 machine
name: Cuda - 4090 full tests
name: gpu_4090_tests
env:
CARGO_TERM_COLOR: always
@@ -27,7 +27,7 @@ permissions:
jobs:
cuda-tests-linux:
name: CUDA tests (RTX 4090)
name: gpu_4090_tests/cuda-tests-linux
if: github.event_name == 'workflow_dispatch' ||
contains(github.event.label.name, '4090_test') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: Cuda - CPU Memory Checks
name: gpu_code_validation_tests
env:
CARGO_TERM_COLOR: always
@@ -31,7 +31,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (cuda-tests)
name: gpu_code_validation_tests/setup-instance
runs-on: ubuntu-latest
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
@@ -58,7 +58,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA Memory Checks tests
name: gpu_code_validation_tests/cuda-tests-linux
needs: [ setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -105,7 +105,7 @@ jobs:
make test_high_level_api_gpu_valgrind
slack-notify:
name: Slack Notification
name: gpu_code_validation_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -127,7 +127,7 @@ jobs:
SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)
name: gpu_code_validation_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: Cuda - Fast tests on H100
name: gpu_fast_h100_tests
env:
CARGO_TERM_COLOR: always
@@ -30,6 +30,7 @@ permissions:
jobs:
should-run:
name: gpu_fast_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -45,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -66,7 +67,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-h100-tests)
name: gpu_fast_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -108,7 +109,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 tests
name: gpu_fast_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -165,7 +166,7 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: Slack Notification
name: gpu_fast_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -187,7 +188,7 @@ jobs:
SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
name: gpu_fast_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: Cuda - Fast tests
name: gpu_fast_tests
env:
CARGO_TERM_COLOR: always
@@ -29,6 +29,7 @@ permissions:
jobs:
should-run:
name: gpu_fast_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -44,7 +45,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -65,7 +66,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-tests)
name: gpu_fast_tests/setup-instance
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
needs.should-run.outputs.gpu_test == 'true'
@@ -93,7 +94,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA tests
name: gpu_fast_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -151,7 +152,7 @@ jobs:
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
name: gpu_fast_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -173,7 +174,7 @@ jobs:
SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)
name: gpu_fast_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: Cuda - Full tests on H100
name: gpu_full_h100_tests
env:
CARGO_TERM_COLOR: always
@@ -20,7 +20,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (cuda-h100-tests)
name: gpu_full_h100_tests/setup-instance
runs-on: ubuntu-latest
outputs:
# Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
@@ -50,7 +50,7 @@ jobs:
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 tests
name: gpu_full_h100_tests/cuda-tests-linux
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow_ref }}
@@ -102,7 +102,7 @@ jobs:
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
name: gpu_full_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ failure() }}
@@ -115,7 +115,7 @@ jobs:
SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
name: gpu_full_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: Cuda - Full tests multi-GPU
name: gpu_full_multi_gpu_tests
env:
CARGO_TERM_COLOR: always
@@ -30,6 +30,7 @@ permissions:
jobs:
should-run:
name: gpu_full_multi_gpu_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -45,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -66,7 +67,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-tests-multi-gpu)
name: gpu_full_multi_gpu_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -85,7 +86,7 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-gpu-test
profile: 4-l40
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
@@ -95,7 +96,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA multi-GPU tests
name: gpu_full_multi_gpu_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -154,7 +155,7 @@ jobs:
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
name: gpu_full_multi_gpu_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -176,7 +177,7 @@ jobs:
SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests-multi-gpu)
name: gpu_full_multi_gpu_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,4 +1,4 @@
name: Cuda - Long Run Tests on GPU
name: gpu_integer_long_run_tests
env:
CARGO_TERM_COLOR: always
@@ -27,7 +27,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (gpu-tests)
name: gpu_integer_long_run_tests/setup-instance
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
@@ -43,10 +43,10 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-gpu-test
profile: 4-l40
cuda-tests:
name: Long run GPU tests
name: gpu_integer_long_run_tests/cuda-tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow_ref }}_${{github.event_name}}
@@ -90,7 +90,7 @@ jobs:
fi
slack-notify:
name: Slack Notification
name: gpu_integer_long_run_tests/slack-notify
needs: [ setup-instance, cuda-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests.result != 'skipped' && failure() }}
@@ -103,7 +103,7 @@ jobs:
SLACK_MESSAGE: "Integer GPU long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (gpu-tests)
name: gpu_integer_long_run_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: Cuda - GPU Memory Checks
name: gpu_memory_sanitizer
env:
CARGO_TERM_COLOR: always
@@ -30,7 +30,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (cuda-tests)
name: gpu_memory_sanitizer/setup-instance
runs-on: ubuntu-latest
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
@@ -57,7 +57,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA Memory Checks tests
name: gpu_memory_sanitizer/cuda-tests-linux
needs: [ setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -102,7 +102,7 @@ jobs:
make test_high_level_api_gpu_sanitizer
slack-notify:
name: Slack Notification
name: gpu_memory_sanitizer/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -124,7 +124,7 @@ jobs:
SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)
name: gpu_memory_sanitizer/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Perform tfhe-cuda-backend post-commit checks on an AWS instance
name: Cuda - Post-commit Checks
name: gpu_pcc
env:
CARGO_TERM_COLOR: always
@@ -28,7 +28,7 @@ permissions:
jobs:
setup-instance:
name: Setup instance (cuda-pcc)
name: gpu_pcc/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -53,7 +53,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-pcc:
name: CUDA post-commit checks
name: gpu_pcc/cuda-pcc (bpr)
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
@@ -149,7 +149,7 @@ jobs:
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-pcc)
name: gpu_pcc/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-pcc ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Signed integer GPU tests on an RTXA6000 VM on hyperstack with classical PBS
name: Cuda - Signed integer tests with classical PBS
name: gpu_signed_integer_classic_tests
env:
CARGO_TERM_COLOR: always
@@ -30,6 +30,7 @@ permissions:
jobs:
should-run:
name: gpu_signed_integer_classic_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -45,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -66,7 +67,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-signed-classic-tests)
name: gpu_signed_integer_classic_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -95,7 +96,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA signed integer tests with classical PBS
name: gpu_signed_integer_classic_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -137,7 +138,7 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
slack-notify:
name: Slack Notification
name: gpu_signed_integer_classic_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -159,7 +160,7 @@ jobs:
SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-signed-classic-tests)
name: gpu_signed_integer_classic_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Signed integer GPU tests on an H100 VM on hyperstack
name: Cuda - Signed integer tests on H100
name: gpu_signed_integer_h100_tests
env:
CARGO_TERM_COLOR: always
@@ -30,6 +30,7 @@ permissions:
jobs:
should-run:
name: gpu_signed_integer_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -45,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -66,7 +67,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-h100-tests)
name: gpu_signed_integer_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -108,7 +109,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 signed integer tests
name: gpu_signed_integer_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -151,7 +152,7 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
name: gpu_signed_integer_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -173,7 +174,7 @@ jobs:
SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
name: gpu_signed_integer_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend signed integer on an AWS instance
name: Cuda - Signed integer tests
name: gpu_signed_integer_tests
env:
CARGO_TERM_COLOR: always
@@ -31,6 +31,7 @@ permissions:
jobs:
should-run:
name: gpu_signed_integer_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -46,7 +47,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -67,7 +68,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-signed-integer-tests)
name: gpu_signed_integer_tests/setup-instance
runs-on: ubuntu-latest
needs: should-run
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
@@ -96,7 +97,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-signed-integer-tests:
name: CUDA signed integer tests
name: gpu_signed_integer_tests/cuda-signed-integer-tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -146,7 +147,7 @@ jobs:
make test_signed_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
name: gpu_signed_integer_tests/slack-notify
needs: [ setup-instance, cuda-signed-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' && failure() }}
@@ -168,7 +169,7 @@ jobs:
SLACK_MESSAGE: "Signed GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)
name: gpu_signed_integer_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-signed-integer-tests ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Test unsigned integers on an RTXA6000 VM on hyperstack with the classical PBS
name: Cuda - Unsigned integer tests with classical PBS
name: gpu_unsigned_integer_classic_tests
env:
CARGO_TERM_COLOR: always
@@ -30,6 +30,7 @@ permissions:
jobs:
should-run:
name: gpu_unsigned_integer_classic_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -45,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -66,7 +67,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-unsigned-classic-tests)
name: gpu_unsigned_integer_classic_tests/setup-instance
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -95,7 +96,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA unsigned integer tests with classical PBS
name: gpu_unsigned_integer_classic_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -137,7 +138,7 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
slack-notify:
name: Slack Notification
name: gpu_unsigned_integer_classic_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -159,7 +160,7 @@ jobs:
SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-unsigned-classic-tests)
name: gpu_unsigned_integer_classic_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Test unsigned integers on an H100 VM on hyperstack
name: Cuda - Unsigned integer tests on H100
name: gpu_unsigned_integer_h100_tests
env:
CARGO_TERM_COLOR: always
@@ -30,6 +30,7 @@ permissions:
jobs:
should-run:
name: gpu_unsigned_integer_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -45,7 +46,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -66,7 +67,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-h100-tests)
name: gpu_unsigned_integer_h100_tests/setup-instance
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -108,7 +109,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 unsigned integer tests
name: gpu_unsigned_integer_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -151,7 +152,7 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
name: gpu_unsigned_integer_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -173,7 +174,7 @@ jobs:
SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
name: gpu_unsigned_integer_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend unsigned integer on an AWS instance
name: Cuda - Unsigned integer tests
name: gpu_unsigned_integer_tests
env:
CARGO_TERM_COLOR: always
@@ -31,6 +31,7 @@ permissions:
jobs:
should-run:
name: gpu_unsigned_integer_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -46,7 +47,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -67,7 +68,7 @@ jobs:
- ci/slab.toml
setup-instance:
name: Setup instance (cuda-unsigned-integer-tests)
name: gpu_unsigned_integer_tests/setup-instance
runs-on: ubuntu-latest
needs: should-run
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
@@ -96,7 +97,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-unsigned-integer-tests:
name: CUDA unsigned integer tests
name: gpu_unsigned_integer_tests/cuda-unsigned-integer-tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -146,7 +147,7 @@ jobs:
make test_unsigned_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
name: gpu_unsigned_integer_tests/slack-notify
needs: [ setup-instance, cuda-unsigned-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' && failure() }}
@@ -168,7 +169,7 @@ jobs:
SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)
name: gpu_unsigned_integer_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-unsigned-integer-tests ]
runs-on: ubuntu-latest


@@ -1,5 +1,5 @@
# Test tfhe-fft
name: Cargo Test HLAPI HPU
# Test HPU backend HLAPI layer
name: hpu_hlapi_tests
on:
pull_request:
@@ -21,6 +21,7 @@ permissions: { }
jobs:
should-run:
name: hpu_hlapi_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read
@@ -36,7 +37,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
hpu:
@@ -46,6 +47,7 @@ jobs:
- mockups/tfhe-hpu-mockup/**
cargo-tests-hpu:
name: hpu_hlapi_tests/cargo-tests-hpu (bpr)
needs: should-run
if: needs.should-run.outputs.hpu_test == 'true'
runs-on: large_ubuntu_16


@@ -1,4 +1,4 @@
name: AWS Long Run Tests on CPU
name: integer_long_run_tests
env:
CARGO_TERM_COLOR: always
@@ -23,7 +23,7 @@ permissions: {}
jobs:
setup-instance:
name: Setup instance (cpu-tests)
name: integer_long_run_tests/setup-instance
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
@@ -42,7 +42,7 @@ jobs:
profile: cpu-big
cpu-tests:
name: Long run CPU tests
name: integer_long_run_tests/cpu-tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow_ref }}_${{github.event_name}}
@@ -74,7 +74,7 @@ jobs:
SLACK_MESSAGE: "CPU long run tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cpu-tests)
name: integer_long_run_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cpu-tests ]
runs-on: ubuntu-latest

View File

@@ -1,4 +1,4 @@
name: Tests on M1 CPU
name: m1_tests
on:
workflow_dispatch:
@@ -32,6 +32,7 @@ permissions:
jobs:
cargo-builds-m1:
name: m1_tests/cargo-builds-m1
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch' ||
contains(github.event.label.name, 'm1_test') }}
@@ -178,7 +179,7 @@ jobs:
make test_integer_multi_bit_ci
remove_label:
name: Remove m1_test label
name: m1_tests/remove_label
runs-on: ubuntu-latest
needs:
- cargo-builds-m1

View File

@@ -1,5 +1,5 @@
# Publish new release of tfhe-rs on various platform.
name: Publish release
name: make_release
on:
workflow_dispatch:
@@ -36,15 +36,18 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
name: make_release/package
runs-on: ubuntu-latest
needs: verify_tag
needs: verify-tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
@@ -66,6 +69,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
@@ -81,7 +85,7 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish Release
name: make_release/publish_release
needs: [package] # for comparing hashes
runs-on: ubuntu-latest
# For provenance of npmjs publish

View File

@@ -1,4 +1,4 @@
name: Publish CUDA release
name: make_release_cuda
on:
workflow_dispatch:
@@ -18,15 +18,17 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release_cuda/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
setup-instance:
name: Setup instance (publish-cuda-release)
needs: verify_tag
name: make_release_cuda/setup-instance
needs: verify-tag
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -43,7 +45,7 @@ jobs:
profile: gpu-build
package:
name: Package CUDA Release for provenance
name: make_release_cuda/package
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
outputs:
@@ -104,6 +106,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_cuda/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
@@ -119,7 +122,7 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish-cuda-release:
name: Publish CUDA Release
name: make_release_cuda/publish-cuda-release
needs: [setup-instance, package] # for comparing hashes
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
permissions:
@@ -201,7 +204,7 @@ jobs:
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (publish-release)
name: make_release_cuda/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, publish-cuda-release]
runs-on: ubuntu-latest

View File

@@ -1,4 +1,4 @@
name: Publish HPU release
name: make_release_hpu
on:
workflow_dispatch:
@@ -18,15 +18,18 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release_hpu/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
name: make_release_hpu/package
runs-on: ubuntu-latest
needs: verify_tag
needs: verify-tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
@@ -48,6 +51,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_hpu/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
@@ -63,9 +67,9 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-hpu-backend Release
name: make_release_hpu/publish-release
runs-on: ubuntu-latest
needs: [verify_tag, package] # for comparing hashes
needs: [verify-tag, package] # for comparing hashes
permissions:
# Needed for OIDC token exchange on crates.io
id-token: write

View File

@@ -1,4 +1,4 @@
name: Publish tfhe-csprng release
name: make_release_tfhe_csprng
on:
workflow_dispatch:
@@ -18,13 +18,16 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release_tfhe_csprng/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
name: make_release_tfhe_csprng/package
runs-on: ubuntu-latest
outputs:
hash: ${{ steps.hash.outputs.hash }}
@@ -48,6 +51,7 @@ jobs:
provenance:
name: make_release_tfhe_csprng/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
@@ -64,8 +68,8 @@ jobs:
publish_release:
name: Publish tfhe-csprng Release
needs: [verify_tag, package]
name: make_release_tfhe_csprng/publish-release
needs: [verify-tag, package]
runs-on: ubuntu-latest
permissions:
# Needed for OIDC token exchange on crates.io

View File

@@ -1,5 +1,5 @@
# Publish new release of tfhe-fft
name: Publish tfhe-fft release
name: make_release_tfhe_fft
on:
workflow_dispatch:
@@ -19,15 +19,18 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release_tfhe_fft/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
name: make_release_tfhe_fft/package
runs-on: ubuntu-latest
needs: verify_tag
needs: verify-tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
@@ -49,6 +52,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_tfhe_fft/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
@@ -64,9 +68,9 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-fft Release
name: make_release_tfhe_fft/publish-release
runs-on: ubuntu-latest
needs: [verify_tag, package] # for comparing hashes
needs: [verify-tag, package] # for comparing hashes
permissions:
# Needed for OIDC token exchange on crates.io
id-token: write

View File

@@ -1,5 +1,5 @@
# Publish new release of tfhe-ntt
name: Publish tfhe-ntt release
name: make_release_tfhe_ntt
on:
workflow_dispatch:
@@ -19,15 +19,18 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release_tfhe_ntt/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
name: make_release_tfhe_ntt/package
runs-on: ubuntu-latest
needs: verify_tag
needs: verify-tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
@@ -49,6 +52,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_tfhe_ntt/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
@@ -64,9 +68,9 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-ntt Release
name: make_release_tfhe_ntt/publish-release
runs-on: ubuntu-latest
needs: [verify_tag, package] # for comparing hashes
needs: [verify-tag, package] # for comparing hashes
permissions:
# Needed for OIDC token exchange on crates.io
id-token: write

View File

@@ -1,4 +1,4 @@
name: Publish tfhe-versionable release
name: make_release_tfhe_versionable
on:
workflow_dispatch:
@@ -13,14 +13,16 @@ env:
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
verify-tag:
name: make_release_tfhe_versionable/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package-derive:
name: Package tfhe-versionable-derive Release
name: make_release_tfhe_versionable/package-derive
runs-on: ubuntu-latest
outputs:
hash: ${{ steps.hash.outputs.hash }}
@@ -43,6 +45,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance-derive:
name: make_release_tfhe_versionable/provenance-derive
needs: [package-derive]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
@@ -57,8 +60,8 @@ jobs:
base64-subjects: ${{ needs.package-derive.outputs.hash }}
publish_release-derive:
name: Publish tfhe-versionable-derive Release
needs: [ verify_tag, package-derive ] # for comparing hashes
name: make_release_tfhe_versionable/publish_release_derive
needs: [ verify-tag, package-derive ] # for comparing hashes
runs-on: ubuntu-latest
permissions:
# Needed for OIDC token exchange on crates.io
@@ -102,7 +105,7 @@ jobs:
SLACK_MESSAGE: "tfhe-versionable-derive release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
package:
name: Package tfhe-versionable Release
name: make_release_tfhe_versionable/package
needs: publish_release-derive
runs-on: ubuntu-latest
outputs:
@@ -126,6 +129,7 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_tfhe_versionable/provenance
needs: package
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
@@ -140,7 +144,7 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-versionable Release
name: make_release_tfhe_versionable/publish-release
needs: package # for comparing hashes
runs-on: ubuntu-latest
steps:

View File

@@ -1,4 +1,4 @@
name: Publish tfhe-zk-pok release
name: make_release_zk_pok
on:
workflow_dispatch:
@@ -15,33 +15,45 @@ env:
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
permissions: { }
jobs:
verify-tag:
name: make_release_zk_pok/verify-tag
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_commit_actor.yml
secrets:
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
runs-on: ubuntu-latest
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Prepare package
run: |
cargo package -p tfhe-zk-pok
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: crate-zk-pok
path: target/package/*.crate
- name: generate hash
id: hash
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
name: make_release_zk_pok/package
runs-on: ubuntu-latest
needs: verify-tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Prepare package
run: |
cargo package -p tfhe-zk-pok
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: crate-zk-pok
path: target/package/*.crate
- name: generate hash
id: hash
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_zk_pok/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
needs: [ package ]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
# Needed to detect the GitHub Actions environment
@@ -54,15 +66,9 @@ jobs:
# SHA-256 hashes of the Crate package.
base64-subjects: ${{ needs.package.outputs.hash }}
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
publish_release:
name: Publish tfhe-zk-pok Release
needs: [verify_tag, package] # for comparing hashes
name: make_release_zk_pok/publish-release
needs: [ verify-tag, package ] # for comparing hashes
runs-on: ubuntu-latest
permissions:
# Needed for OIDC token exchange on crates.io

View File

@@ -1,5 +1,5 @@
# Perform a security check on all the cryptographic parameters set
name: Parameters curves security check
name: parameters_check
env:
CARGO_TERM_COLOR: always
@@ -16,6 +16,7 @@ permissions: {}
jobs:
params-curves-security-check:
name: parameters_check/params-curves-security-check
runs-on: large_ubuntu_16-22.04
steps:
- name: Checkout tfhe-rs
@@ -29,7 +30,7 @@ jobs:
with:
repository: malb/lattice-estimator
path: lattice_estimator
ref: '52f4b7a99ae7b5dfd088c5c295070bd38ff0d1e0'
ref: 'e35f45b7976a90a79c3c6625a45bbc344c1abc67'
persist-credentials: 'false'
- name: Install Sage

View File

@@ -1,5 +1,5 @@
# Placeholder workflow file allowing running it without having to merge to main first
name: Placeholder Workflow
name: placeholder_workflow
on:
workflow_dispatch:
@@ -8,7 +8,7 @@ permissions: {}
jobs:
placeholder:
name: Placeholder
name: placeholder_workflow/placeholder
runs-on: ubuntu-latest
steps:

View File

@@ -1,5 +1,5 @@
# Sync repos
name: Sync repos
name: sync_on_push
on:
push:
@@ -7,30 +7,62 @@ on:
- 'main'
workflow_dispatch:
permissions: {}
permissions: { }
jobs:
sync-repo:
name: sync_on_push/sync-repo
if: ${{ github.repository == 'zama-ai/tfhe-rs' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: git-sync
uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "main"
destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
destination_branch: "main"
- name: git-sync tags
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "refs/tags/*"
destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
destination_branch: "refs/tags/*"
env:
SOURCE_REPO: "zama-ai/tfhe-rs"
SOURCE_BRANCH: "main"
DESTINATION_BRANCH: "main"
USERNAME: ${{ secrets.BOT_USERNAME }}
TOKEN: ${{ secrets.SYNC_REPO_TOKEN }}
DEST_REPO: ${{ secrets.SYNC_DEST_REPO }}
run: |
echo ">>> Cloning source repo..."
git lfs install
git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
git fetch source '+refs/heads/*:refs/heads/*' --update-head-ok
echo ">>> Print out all branches"
git --no-pager branch -a -vv
echo ">>> Fetching all LFS items from source..."
git lfs fetch --all source "${SOURCE_BRANCH}"
echo ">>> Pushing git changes..."
git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
echo ">>> Pushing all LFS items..."
git lfs push --all destination "${DESTINATION_BRANCH}"
- name: git-sync-tags
env:
SOURCE_REPO: "zama-ai/tfhe-rs"
SOURCE_BRANCH: "refs/tags/*"
DESTINATION_BRANCH: "refs/tags/*"
USERNAME: ${{ secrets.BOT_USERNAME }}
TOKEN: ${{ secrets.SYNC_REPO_TOKEN }}
DEST_REPO: ${{ secrets.SYNC_DEST_REPO }}
run: |
echo ">>> Cloning source repo..."
git lfs install
git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
git fetch source '+refs/heads/*:refs/heads/*' --update-head-ok
echo ">>> Print out all branches"
git --no-pager branch -a -vv
echo ">>> Pushing git changes..."
git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f

View File

@@ -1,4 +1,5 @@
name: 'Close unverified PRs'
# Close unverified PRs
name: unverified_prs
on:
schedule:
- cron: '30 1 * * *'
@@ -7,12 +8,13 @@ permissions: {}
jobs:
stale:
name: unverified_prs/stale
runs-on: ubuntu-latest
permissions:
issues: read
pull-requests: write
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days; it will now be closed. If you want to contribute, please sign the CLA as indicated by the bot.'
days-before-stale: 2

View File

@@ -1,10 +1,10 @@
# Verify a tagged commit
name: Verify tagged commit
# Verify a commit actor
name: verify_commit_actor
on:
workflow_call:
secrets:
RELEASE_TEAM:
ALLOWED_TEAM:
required: true
READ_ORG_TOKEN:
required: true
@@ -12,9 +12,9 @@ on:
permissions: {}
jobs:
checks:
check-actor:
name: verify_commit_actor/check-actor
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/')
steps:
# Check triggering actor membership
- name: Actor verification
@@ -23,7 +23,7 @@ jobs:
with:
username: ${{ github.triggering_actor }}
org: ${{ github.repository_owner }}
team: ${{ secrets.RELEASE_TEAM }}
team: ${{ secrets.ALLOWED_TEAM }}
github_token: ${{ secrets.READ_ORG_TOKEN }}
- name: Actor authorized

View File

@@ -32,7 +32,7 @@ pulp = { version = "0.21", default-features = false }
rand = "0.8"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.100"
wasm-bindgen = "0.2.101"
getrandom = "0.2.8"
[profile.bench]

View File

@@ -21,11 +21,11 @@ BENCH_OP_FLAVOR?=DEFAULT
BENCH_TYPE?=latency
BENCH_PARAM_TYPE?=classical
BENCH_PARAMS_SET?=default
BENCH_CUSTOM_COMMAND:=
NODE_VERSION=22.6
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
WASM_PACK_VERSION="0.13.1"
# We are kind of hacking the cut here, the version cannot contain a quote '"'
WASM_BINDGEN_VERSION:=$(shell grep '^wasm-bindgen[[:space:]]*=' Cargo.toml | cut -d '"' -f 2 | xargs)
WASM_BINDGEN_VERSION:=$(shell cargo tree --target wasm32-unknown-unknown -e all --prefix none | grep "wasm-bindgen v" | head -n 1 | cut -d 'v' -f2)
WEB_RUNNER_DIR=web-test-runner
WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
# This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
@@ -114,10 +114,6 @@ install_cargo_nextest: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-nextest --locked || \
( echo "Unable to install cargo nextest, unknown error." && exit 1 )
# The installation should use the ^ symbol if the specified version in the root Cargo.toml is of the
# form "0.2.96" then we get ^0.2.96 e.g., as we don't lock those dependencies
# this allows to get the matching CLI
# If a version range is specified no need to add the leading ^
.PHONY: install_wasm_bindgen_cli # Install wasm-bindgen-cli to get access to the test runner
install_wasm_bindgen_cli: install_rs_build_toolchain
cargo +$(RS_BUILD_TOOLCHAIN) install --locked wasm-bindgen-cli --version "$(WASM_BINDGEN_VERSION)"
@@ -160,9 +156,13 @@ install_tarpaulin: install_rs_build_toolchain
( echo "Unable to install cargo tarpaulin, unknown error." && exit 1 )
.PHONY: install_cargo_dylint # Install custom tfhe-rs lints
install_cargo_dylint:
install_cargo_dylint: install_rs_build_toolchain
cargo install --locked cargo-dylint dylint-link
.PHONY: install_cargo_audit # Check dependencies
install_cargo_audit: install_rs_build_toolchain
cargo install --locked cargo-audit
.PHONY: install_typos_checker # Install typos checker
install_typos_checker: install_rs_build_toolchain
@typos --version > /dev/null 2>&1 || \
@@ -545,6 +545,10 @@ tfhe_lints: install_cargo_dylint
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
--features=experimental
.PHONY: audit_dependencies # Run cargo audit to check vulnerable dependencies
audit_dependencies: install_rs_build_toolchain install_cargo_audit
cargo audit
.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -693,7 +697,7 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=integer,gpu -p tfhe -- integer::gpu::server_key::
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4
.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
test_integer_gpu_debug: install_rs_build_toolchain
@@ -990,11 +994,34 @@ test_high_level_api: install_rs_build_toolchain
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
-- high_level_api::
test_high_level_api_gpu_one: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-- --nocapture high_level_api::array::tests::booleans::test_gpu_only_bitand
#
test_high_level_api_gpu_mul: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test \
--features=integer,internal-keycache,gpu-debug --profile release \
-p tfhe \
-- --nocapture integer::gpu::server_key::radix::tests_unsigned::test_mul:: \
--test-threads=6
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"
test_list_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest list --cargo-profile $(CARGO_PROFILE) \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/.*gpu.*/)"
.PHONY: build_one_hl_api_test_gpu
build_one_hl_api_test_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=integer,gpu-debug -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
@@ -1556,6 +1583,11 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_custom # Run benchmarks with a user-defined command
bench_custom: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-benchmark $(BENCH_CUSTOM_COMMAND)
#
# Utility tools
#

View File

@@ -86,6 +86,7 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
message("Compiling in Debug mode")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
set(USE_NVTOOLS 1)
else()
# Release mode
message("Compiling in Release mode")

View File

@@ -4,10 +4,19 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <fstream>
#include <vector>
#define CUDA_STREAM_POOL
enum CudaStreamType
{
KEY = 0,
ALLOC = 1,
TEMP_HELPER = 2,
};
extern "C" {
#define check_cuda_error(ans) \
@@ -76,13 +85,27 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
uint32_t cuda_is_available();
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t &size_tracker,
bool allocate_gpu_memory);
void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
cudaStream_t stream,
uint32_t gpu_index,
uint64_t &size_tracker,
bool allocate_gpu_memory,
const char *file, int line);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
#define cuda_malloc_with_size_tracking_async( \
size, stream, gpu_index, size_tracker, allocate_gpu_memory) \
cuda_intern_malloc_with_size_tracking_async( \
size, stream, gpu_index, size_tracker, allocate_gpu_memory, __FILE__, \
__LINE__)
void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index, const char *file, int line);
#define cuda_malloc_async(size, stream, gpu_index) \
cuda_int_malloc_async(size, stream, gpu_index, __FILE__, __LINE__)
void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index);
bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
uint64_t cuda_device_total_memory(uint32_t gpu_index);
@@ -96,18 +119,28 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
void *dest, void const *src, uint64_t size, cudaStream_t stream,
uint32_t gpu_index, bool gpu_memory_allocated);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
uint64_t size, cudaStream_t stream,
uint32_t gpu_index);
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
uint32_t gpu_index);
void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
uint32_t gpu_index);
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
uint64_t size, cudaStream_t stream,
@@ -116,6 +149,8 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
@@ -123,13 +158,26 @@ int cuda_get_number_of_sms();
void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line);
#define cuda_drop(ptr, gpu_index) \
cuda_int_drop(ptr, gpu_index, __FILE__, __LINE__)
void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated);
void cuda_ext_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated,
const char *file, int line);
#define cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, \
gpu_memory_allocated) \
cuda_int_drop_with_size_tracking_async( \
ptr, stream, gpu_index, gpu_memory_allocated, __FILE__, __LINE__)
void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
const char *file, int line);
#define cuda_drop_async(ptr, stream, gpu_index) \
cuda_int_drop_async(ptr, stream, gpu_index, __FILE__, __LINE__)
}
uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
@@ -141,4 +189,5 @@ bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
#endif
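A note on the macro pattern in this header: wrapping cuda_malloc_async, cuda_drop and friends in macros makes __FILE__ and __LINE__ expand at each call site, so the memory-management layer can attribute every allocation and drop to a source location. A minimal host-only sketch of the same idiom follows; the names and the std::map bookkeeping are illustrative, not the backend's actual allocator:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <utility>

// For each live pointer, record the file and line captured by the macro.
static std::map<void *, std::pair<const char *, int>> g_live_allocs;

void *tracked_malloc_impl(std::size_t size, const char *file, int line) {
  void *ptr = std::malloc(size);
  g_live_allocs[ptr] = {file, line};
  return ptr;
}

void tracked_free_impl(void *ptr, const char *file, int line) {
  if (g_live_allocs.erase(ptr) == 0)
    std::fprintf(stderr, "freeing untracked pointer at %s:%d\n", file, line);
  std::free(ptr);
}

// The macros expand at the caller, so __FILE__/__LINE__ name the call site,
// not this header -- the same shape as the cuda_malloc_async/cuda_drop pairs.
#define tracked_malloc(size) tracked_malloc_impl((size), __FILE__, __LINE__)
#define tracked_free(ptr) tracked_free_impl((ptr), __FILE__, __LINE__)

int main() {
  void *data = tracked_malloc(64);
  for (const auto &entry : g_live_allocs)
    std::printf("%p allocated at %s:%d\n", entry.first, entry.second.first,
                entry.second.second);
  tracked_free(data);
}
```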

View File

@@ -4,6 +4,8 @@
#include <variant>
#include <vector>
#include "integer/integer.h"
extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU;
@@ -37,10 +39,149 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}
int get_active_gpu_count(int num_inputs, int gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);
// A set of GPU streams and the GPUs they are associated with.
// It can be constructed from the FFI struct CudaStreamsFFI, which
// is only used to pass the streams/GPUs across the Rust/C interface.
// This class should only be constructed from the FFI struct,
// through class methods, or through the copy constructor. The class
// can also be constructed as an empty set.
struct CudaStreams {
private:
cudaStream_t const *_streams;
uint32_t const *_gpu_indexes;
uint32_t _gpu_count;
bool _owns_streams;
// Prevent the construction of a CudaStreams class from user-code
CudaStreams(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count)
: _streams(streams), _gpu_indexes(gpu_indexes), _gpu_count(gpu_count),
_owns_streams(false) {}
public:
// Construct an empty set. Invalid use of an empty set should raise an error
// right away through asserts or because of a nullptr dereference
CudaStreams()
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
_owns_streams(false) {}
// Returns a subset of this set as an active subset. An active subset is one
// that is temporarily used to perform some computation
CudaStreams active_gpu_subset(int num_radix_blocks) {
return CudaStreams(_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count));
}
// Returns a subset containing only the first GPU of this set. It
// is used to create a subset of streams for mono-GPU functions
CudaStreams subset_first_gpu() const {
return CudaStreams(_streams, _gpu_indexes, 1);
}
// Synchronize all the streams in the set
void synchronize() const {
for (uint32_t i = 0; i < _gpu_count; i++) {
cuda_synchronize_stream(_streams[i], _gpu_indexes[i]);
}
}
cudaStream_t stream(uint32_t idx) const {
PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
return _streams[idx];
}
uint32_t gpu_index(uint32_t idx) const {
PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
return _gpu_indexes[idx];
}
uint32_t count() const { return _gpu_count; }
// Construct from the Rust FFI stream set. Streams are created in Rust
// using the bindings.
CudaStreams(CudaStreamsFFI &ffi)
: _streams((cudaStream_t *)ffi.streams), _gpu_indexes(ffi.gpu_indexes),
_gpu_count(ffi.gpu_count), _owns_streams(false) {}
// Create a new set of streams on the same GPUs as those of the current
// stream set. Can be used to parallelize computation by issuing kernels
// on multiple streams on the same GPU
void create_on_same_gpus(const CudaStreams &other) {
PANIC_IF_FALSE(_streams == nullptr,
"Assign clone to non-empty cudastreams");
cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];
uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
for (uint32_t i = 0; i < other._gpu_count; ++i) {
new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
gpu_indexes_clone[i] = other._gpu_indexes[i];
}
this->_streams = new_streams;
this->_gpu_indexes = gpu_indexes_clone;
this->_gpu_count = other._gpu_count;
// Flag this instance as owning streams so that we can destroy
// the streams when they aren't needed anymore
this->_owns_streams = true;
}
// Copy constructor, setting the ownership flag to false.
// Only the initial instance of CudaStreams created with
// create_on_same_gpus owns streams; all copies of it do not
// own the streams
CudaStreams(const CudaStreams &src)
: _streams(src._streams), _gpu_indexes(src._gpu_indexes),
_gpu_count(src._gpu_count), _owns_streams(false) {}
CudaStreams &operator=(CudaStreams const &other) {
PANIC_IF_FALSE(this->_streams == nullptr ||
this->_streams == other._streams,
"Assigning an already initialized CudaStreams");
this->_streams = other._streams;
this->_gpu_indexes = other._gpu_indexes;
this->_gpu_count = other._gpu_count;
// Only the initial instance of CudaStreams created with
// create_on_same_gpus owns streams; all copies of it do not
// own the streams
this->_owns_streams = false;
return *this;
}
// Destroy the streams if they were created by create_on_same_gpus.
// We require the developer to call `release` on all instances
// of cloned streams.
void release() {
// If this instance doesn't own streams, there's nothing to do
// as the streams were created on the Rust side.
if (_owns_streams) {
for (uint32_t i = 0; i < _gpu_count; ++i) {
cuda_destroy_stream(_streams[i], _gpu_indexes[i]);
}
delete[] _streams;
_streams = nullptr;
delete[] _gpu_indexes;
_gpu_indexes = nullptr;
}
}
// The destructor checks that streams created with create_on_same_gpus
// were released manually with `release`.
~CudaStreams() {
// Ensure streams are destroyed
PANIC_IF_FALSE(
!_owns_streams || _streams == nullptr,
"Destroy (this=%p) was not called on a CudaStreams object that "
"is a clone "
"of another one, %p",
this, this->_streams);
}
};
#endif
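The ownership rules above can be summarized with a tiny host-only analogue: only the instance that creates streams via create_on_same_gpus owns them, every copy borrows, and release() must run before an owning instance dies. A sketch under those assumptions (illustrative names, a std::vector stand-in, no CUDA calls):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

struct StreamSet {
  std::vector<int> streams; // stand-ins for cudaStream_t handles
  bool owns = false;

  void acquire(int count) { // plays the role of create_on_same_gpus
    assert(streams.empty());
    for (int i = 0; i < count; i++)
      streams.push_back(i); // real code would call cuda_create_stream
    owns = true;
  }
  StreamSet borrow() const { // like the copy constructor: copies never own
    StreamSet s;
    s.streams = streams;
    return s;
  }
  void release() { // must be called on the owner, like CudaStreams::release
    if (owns) {
      streams.clear(); // real code would call cuda_destroy_stream
      owns = false;
    }
  }
  ~StreamSet() { assert(!owns); } // mirrors the PANIC check in ~CudaStreams
};

int main() {
  StreamSet owner;
  owner.acquire(4);
  StreamSet borrowed = owner.borrow(); // non-owning copy, safe to drop
  std::printf("owner=%zu borrowed=%zu\n", owner.streams.size(),
              borrowed.streams.size());
  owner.release(); // mandatory before owner goes out of scope
}
```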

View File

@@ -2,6 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H
#include "../../pbs/pbs_enums.h"
#include "../integer.h"
typedef struct {
void *ptr;
@@ -25,77 +26,65 @@ typedef struct {
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log,
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
uint32_t carry_modulus, bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
CudaStreamsFFI streams, int8_t **mem_ptr_void);
}
#endif
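As a reading aid for the signature change in this file: the repeated (streams, gpu_indexes, gpu_count) triple collapses into a single CudaStreamsFFI value. A hedged sketch of that bundle, inferred from the field accesses in CudaStreams (ffi.streams, ffi.gpu_indexes, ffi.gpu_count) rather than taken from the backend's actual header:

```cpp
#include <cstdint>

// Hypothetical layout; the real CudaStreamsFFI is defined in the backend.
struct CudaStreamsFFI_sketch {
  void *const *streams;        // opaque cudaStream_t handles created in Rust
  const uint32_t *gpu_indexes; // GPU ordinal backing each stream
  uint32_t gpu_count;          // number of streams/GPUs in the set
};

// Before: every entry point repeated the same three parameters.
void entry_point_old(void *const *streams, const uint32_t *gpu_indexes,
                     uint32_t gpu_count) {}

// After: one struct passed by value keeps the C ABI uniform and makes it
// impossible to pass a mismatched (streams, gpu_indexes, gpu_count) trio.
void entry_point_new(CudaStreamsFFI_sketch streams) {}

int main() { return 0; }
```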

View File

@@ -12,8 +12,7 @@ template <typename Torus> struct int_compression {
bool gpu_memory_allocated;
uint32_t lwe_per_glwe;
int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
int_compression(CudaStreams streams, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
@@ -25,26 +24,29 @@ template <typename Torus> struct int_compression {
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory));
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));
tmp_glwe_array_out =
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory));
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], &fp_ks_buffer,
streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,
num_radix_blocks, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(tmp_lwe, streams[0], gpu_indexes[0],
void release(CudaStreams streams) {
cuda_drop_with_size_tracking_async(
tmp_lwe, streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cleanup_packing_keyswitch_lwe_list_to_glwe(
streams[0], gpu_indexes[0], &fp_ks_buffer, gpu_memory_allocated);
streams.stream(0), streams.gpu_index(0), &fp_ks_buffer,
gpu_memory_allocated);
}
};
@@ -60,8 +62,7 @@ template <typename Torus> struct int_decompression {
int_radix_lut<Torus> *decompression_rescale_lut;
bool gpu_memory_allocated;
int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_decompression(CudaStreams streams, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
uint64_t &size_tracker) {
@@ -78,19 +79,21 @@ template <typename Torus> struct int_decompression {
tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
num_blocks_to_decompress * sizeof(uint32_t), streams.stream(0),
streams.gpu_index(0), size_tracker, allocate_gpu_memory);
tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
// rescale is only needed on 64-bit decompression
if constexpr (std::is_same_v<Torus, uint64_t>) {
decompression_rescale_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
streams, encryption_params, 1, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
// Rescale is done using an identity LUT
// Here we do not divide by message_modulus
@@ -98,8 +101,8 @@ template <typename Torus> struct int_decompression {
// space, we want to keep the original 2-bit value in the 4-bit space,
// so we apply the identity and the encoding will rescale it for us.
decompression_rescale_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
streams, encryption_params, 1, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
auto decompression_rescale_f = [](Torus x) -> Torus { return x; };
auto effective_compression_message_modulus =
@@ -107,7 +110,8 @@ template <typename Torus> struct int_decompression {
auto effective_compression_carry_modulus = 1;
generate_device_accumulator_with_encoding<Torus>(
streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
streams.stream(0), streams.gpu_index(0),
decompression_rescale_lut->get_lut(0, 0),
decompression_rescale_lut->get_degree(0),
decompression_rescale_lut->get_max_degree(0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
@@ -115,22 +119,22 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
decompression_rescale_lut->broadcast_lut(active_streams);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_indexes_array, streams[0],
gpu_indexes[0], gpu_memory_allocated);
void release(CudaStreams streams) {
cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_indexes_array, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
if constexpr (std::is_same_v<Torus, uint64_t>) {
decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
decompression_rescale_lut->release(streams);
delete decompression_rescale_lut;
decompression_rescale_lut = nullptr;
}

File diff suppressed because it is too large

View File

@@ -47,12 +47,11 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void const *lwe_array_in, void const *lwe_input_indexes,
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,

View File

@@ -6,36 +6,26 @@
#include <stdint.h>
extern "C" {
void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
void *lwe_array_out, const void *lwe_compact_array_in,
uint32_t lwe_dimension, uint32_t num_lwe,
const uint32_t *lwe_compact_input_indexes,
const uint32_t *output_body_id_per_compact_list);
uint64_t scratch_cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
uint32_t computing_ks_level, uint32_t computing_ks_base_log,
uint32_t casting_input_dimension, uint32_t casting_output_dimension,
uint32_t casting_ks_level, uint32_t casting_ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t computing_ks_level,
uint32_t computing_ks_base_log, uint32_t casting_input_dimension,
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, const void *lwe_flattened_compact_array_in,
int8_t *mem_ptr, void *const *bsks, void *const *computing_ksks,
void *const *casting_keys,
CudaStreamsFFI streams, void *lwe_array_out,
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_expand_without_verification_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
}
#endif // ZK_H

View File

@@ -5,6 +5,96 @@
#include "integer/integer.cuh"
#include <cstdint>
////////////////////////////////////
// Helper structures used in expand
template <typename Torus> struct lwe_mask {
Torus *mask;
lwe_mask(Torus *mask) : mask{mask} {}
};
template <typename Torus> struct compact_lwe_body {
Torus *body;
uint64_t monomial_degree;
/* Body id is the index of the body in the compact ciphertext list.
* It's used to compute the rotation.
*/
compact_lwe_body(Torus *body, const uint64_t body_id)
: body{body}, monomial_degree{body_id} {}
};
template <typename Torus> struct compact_lwe_list {
Torus *ptr;
uint32_t lwe_dimension;
uint32_t total_num_lwes;
compact_lwe_list(Torus *ptr, uint32_t lwe_dimension, uint32_t total_num_lwes)
: ptr{ptr}, lwe_dimension{lwe_dimension}, total_num_lwes{total_num_lwes} {
}
lwe_mask<Torus> get_mask() { return lwe_mask(ptr); }
// Returns the index-th body
compact_lwe_body<Torus> get_body(uint32_t index) {
if (index >= total_num_lwes) {
PANIC("index out of range in compact_lwe_list::get_body");
}
return compact_lwe_body(&ptr[lwe_dimension + index], uint64_t(index));
}
};
template <typename Torus> struct flattened_compact_lwe_lists {
Torus *d_ptr;
Torus **d_ptr_to_compact_list;
const uint32_t *h_num_lwes_per_compact_list;
uint32_t num_compact_lists;
uint32_t lwe_dimension;
uint32_t total_num_lwes;
flattened_compact_lwe_lists(Torus *d_ptr,
const uint32_t *h_num_lwes_per_compact_list,
uint32_t num_compact_lists,
uint32_t lwe_dimension)
: d_ptr(d_ptr), h_num_lwes_per_compact_list(h_num_lwes_per_compact_list),
num_compact_lists(num_compact_lists), lwe_dimension(lwe_dimension) {
d_ptr_to_compact_list =
static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus **)));
total_num_lwes = 0;
auto curr_list = d_ptr;
for (auto i = 0; i < num_compact_lists; ++i) {
total_num_lwes += h_num_lwes_per_compact_list[i];
d_ptr_to_compact_list[i] = curr_list;
curr_list += lwe_dimension + h_num_lwes_per_compact_list[i];
}
}
compact_lwe_list<Torus> get_device_compact_list(uint32_t compact_list_index) {
if (compact_list_index >= num_compact_lists) {
PANIC("index out of range in flattened_compact_lwe_lists::get");
}
return compact_lwe_list(d_ptr_to_compact_list[compact_list_index],
lwe_dimension,
h_num_lwes_per_compact_list[compact_list_index]);
}
};
/*
 * An expand_job tells the expand kernel exactly which input mask and body
 * to use and what rotation to apply
*/
template <typename Torus> struct expand_job {
lwe_mask<Torus> mask_to_use;
compact_lwe_body<Torus> body_to_use;
expand_job(lwe_mask<Torus> mask_to_use, compact_lwe_body<Torus> body_to_use)
: mask_to_use{mask_to_use}, body_to_use{body_to_use} {}
};
////////////////////////////////////
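To make the layout concrete, here is a small host-side check of the offset arithmetic walked by flattened_compact_lwe_lists: each compact list stores one shared mask of lwe_dimension elements followed by its bodies, so list k begins right after list k-1's mask and bodies. The sizes below are made-up illustrations:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint32_t lwe_dimension = 4;
  const std::vector<uint32_t> num_lwes_per_compact_list = {3, 2};

  uint32_t total_num_lwes = 0;
  uint64_t offset = 0; // element offset of each list in the flat array
  for (std::size_t k = 0; k < num_lwes_per_compact_list.size(); ++k) {
    std::printf("list %zu: starts at element %llu, %u bodies\n", k,
                (unsigned long long)offset, num_lwes_per_compact_list[k]);
    total_num_lwes += num_lwes_per_compact_list[k];
    offset += lwe_dimension + num_lwes_per_compact_list[k]; // mask + bodies
  }
  std::printf("total LWEs across all lists: %u\n", total_num_lwes);
}
```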
template <typename Torus> struct zk_expand_mem {
int_radix_params computing_params;
int_radix_params casting_params;
@@ -17,13 +107,13 @@ template <typename Torus> struct zk_expand_mem {
Torus *tmp_expanded_lwes;
Torus *tmp_ksed_small_to_big_expanded_lwes;
uint32_t *d_lwe_compact_input_indexes;
uint32_t *d_body_id_per_compact_list;
bool gpu_memory_allocated;
zk_expand_mem(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params computing_params,
uint32_t *num_lwes_per_compact_list;
expand_job<Torus> *d_expand_jobs;
expand_job<Torus> *h_expand_jobs;
zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
@@ -33,9 +123,17 @@ template <typename Torus> struct zk_expand_mem {
casting_key_type(casting_key_type) {
gpu_memory_allocated = allocate_gpu_memory;
// We copy num_lwes_per_compact_list to guard against the caller freeing it
// while this buffer is still in use
this->num_lwes_per_compact_list =
(uint32_t *)malloc(num_compact_lists * sizeof(uint32_t));
memcpy(this->num_lwes_per_compact_list, num_lwes_per_compact_list,
num_compact_lists * sizeof(uint32_t));
num_lwes = 0;
for (int i = 0; i < num_compact_lists; i++) {
num_lwes += num_lwes_per_compact_list[i];
num_lwes += this->num_lwes_per_compact_list[i];
}
if (computing_params.carry_modulus != computing_params.message_modulus) {
@@ -73,11 +171,10 @@ template <typename Torus> struct zk_expand_mem {
params = computing_params;
}
message_and_carry_extract_luts = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 4, 2 * num_lwes,
allocate_gpu_memory, size_tracker);
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 0),
message_and_carry_extract_luts->get_degree(0),
message_and_carry_extract_luts->get_max_degree(0),
@@ -85,7 +182,7 @@ template <typename Torus> struct zk_expand_mem {
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 1),
message_and_carry_extract_luts->get_degree(1),
message_and_carry_extract_luts->get_max_degree(1),
@@ -93,7 +190,7 @@ template <typename Torus> struct zk_expand_mem {
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 2),
message_and_carry_extract_luts->get_degree(2),
message_and_carry_extract_luts->get_max_degree(2),
@@ -102,7 +199,7 @@ template <typename Torus> struct zk_expand_mem {
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 3),
message_and_carry_extract_luts->get_degree(3),
message_and_carry_extract_luts->get_max_degree(3),
@@ -110,9 +207,12 @@ template <typename Torus> struct zk_expand_mem {
params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
// Hint for future readers: if message_modulus == 4 then
// packed_messages_per_lwe becomes 2
auto num_packed_msgs = log2_int(params.message_modulus);
// We are always packing two LWEs. We just need to be sure there is enough
// space in the carry part to store a message of the same size as the one
// in the message part.
if (params.carry_modulus < params.message_modulus)
PANIC("Carry modulus must be at least as large as message modulus");
auto num_packed_msgs = 2;
// Adjust indexes to permute the output and access the correct LUT
auto h_indexes_in = static_cast<Torus *>(
@@ -121,49 +221,14 @@ template <typename Torus> struct zk_expand_mem {
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_lut_indexes = static_cast<Torus *>(
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
- auto h_body_id_per_compact_list =
-     static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
- auto h_lwe_compact_input_indexes =
-     static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
- d_body_id_per_compact_list =
-     static_cast<uint32_t *>(cuda_malloc_with_size_tracking_async(
-         num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-         size_tracker, allocate_gpu_memory));
- d_lwe_compact_input_indexes =
-     static_cast<uint32_t *>(cuda_malloc_with_size_tracking_async(
-         num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-         size_tracker, allocate_gpu_memory));
+ d_expand_jobs =
+     static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
+         num_lwes * sizeof(expand_job<Torus>), streams.stream(0),
+         streams.gpu_index(0), size_tracker, allocate_gpu_memory));
- auto compact_list_id = 0;
- auto idx = 0;
- auto count = 0;
- // During flattening, all num_lwes LWEs from all compact lists are stored
- // sequentially on a Torus array. h_lwe_compact_input_indexes stores the
- // index of the first LWE related to the compact list that contains the i-th
- // LWE
- for (int i = 0; i < num_lwes; i++) {
-   h_lwe_compact_input_indexes[i] = idx;
-   count++;
-   if (count == num_lwes_per_compact_list[compact_list_id]) {
-     compact_list_id++;
-     idx += casting_params.big_lwe_dimension + count;
-     count = 0;
-   }
- }
- // Stores the index of the i-th LWE (within each compact list) related to
- // the k-th compact list.
- auto offset = 0;
- for (int k = 0; k < num_compact_lists; k++) {
-   auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
-   uint32_t body_count = 0;
-   for (int i = 0; i < num_lwes_in_kth_compact_list; i++) {
-     h_body_id_per_compact_list[i + offset] = body_count;
-     body_count++;
-   }
-   offset += num_lwes_in_kth_compact_list;
- }
+ h_expand_jobs = static_cast<expand_job<Torus> *>(
+     malloc(num_lwes * sizeof(expand_job<Torus>)));
/*
* Each LWE contains encrypted data in both carry and message spaces
@@ -198,9 +263,9 @@ template <typename Torus> struct zk_expand_mem {
* num_packed_msgs to use the sanitization LUT (which ensures output is
* exactly 0 or 1).
*/
- offset = 0;
+ auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
- auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+ auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
@@ -217,61 +282,53 @@ template <typename Torus> struct zk_expand_mem {
}
message_and_carry_extract_luts->set_lwe_indexes(
- streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
+ streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
- cuda_memcpy_with_size_tracking_async_to_gpu(
-     d_lwe_compact_input_indexes, h_lwe_compact_input_indexes,
-     num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-     allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
    lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
-   streams[0], gpu_indexes[0], allocate_gpu_memory);
- cuda_memcpy_with_size_tracking_async_to_gpu(
-     d_body_id_per_compact_list, h_body_id_per_compact_list,
-     num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-     allocate_gpu_memory);
+   streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
- message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
-                                               active_gpu_count);
+ auto active_streams = streams.active_gpu_subset(2 * num_lwes);
+ message_and_carry_extract_luts->broadcast_lut(active_streams);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
- allocate_gpu_memory);
+ active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
- streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
+ streams.stream(0), streams.gpu_index(0), size_tracker,
+ allocate_gpu_memory);
tmp_ksed_small_to_big_expanded_lwes =
(Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
- streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
+ streams.stream(0), streams.gpu_index(0), size_tracker,
+ allocate_gpu_memory);
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_indexes_in);
free(h_indexes_out);
free(h_lut_indexes);
- free(h_body_id_per_compact_list);
- free(h_lwe_compact_input_indexes);
}
- void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-              uint32_t gpu_count) {
+ void release(CudaStreams streams) {
- message_and_carry_extract_luts->release(streams, gpu_indexes, gpu_count);
+ message_and_carry_extract_luts->release(streams);
delete message_and_carry_extract_luts;
- cuda_drop_with_size_tracking_async(d_body_id_per_compact_list, streams[0],
-                                    gpu_indexes[0], gpu_memory_allocated);
- cuda_drop_with_size_tracking_async(d_lwe_compact_input_indexes, streams[0],
-                                    gpu_indexes[0], gpu_memory_allocated);
- cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams[0],
-                                    gpu_indexes[0], gpu_memory_allocated);
- cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
-                                    streams[0], gpu_indexes[0],
+ cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
+                                    streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
streams.stream(0), streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(num_lwes_per_compact_list);
free(h_expand_jobs);
}
};
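
The common thread in these hunks is that the old `(streams, gpu_indexes, gpu_count)` triple collapses into a single `CudaStreams` value. The real type lives elsewhere in the tree; as a rough mental model only, a minimal sketch of such a wrapper (the method names `stream`, `gpu_index`, `count`, and `active_gpu_subset` are inferred from the call sites above, not copied from the actual header) could look like:

#include <algorithm>
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

class CudaStreams {
  std::vector<cudaStream_t> streams_;
  std::vector<uint32_t> gpu_indexes_;

public:
  CudaStreams(std::vector<cudaStream_t> streams,
              std::vector<uint32_t> gpu_indexes)
      : streams_(std::move(streams)), gpu_indexes_(std::move(gpu_indexes)) {}

  cudaStream_t stream(size_t i) const { return streams_[i]; }
  uint32_t gpu_index(size_t i) const { return gpu_indexes_[i]; }
  uint32_t count() const { return static_cast<uint32_t>(streams_.size()); }

  // Keep only the streams that actually receive work, mirroring the
  // active_gpu_subset(num_inputs) calls in the hunks above.
  CudaStreams active_gpu_subset(uint32_t num_inputs) const {
    size_t n = std::min<size_t>(count(), num_inputs);
    return CudaStreams(
        std::vector<cudaStream_t>(streams_.begin(), streams_.begin() + n),
        std::vector<uint32_t>(gpu_indexes_.begin(), gpu_indexes_.begin() + n));
  }
};

Passing this by value keeps every callee from having to thread three loosely coupled arguments through its own signature, which is exactly the churn these hunks remove.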

View File

@@ -49,17 +49,16 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
* global memory
*/
template <typename T, typename ST, class params>
- void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
-                            uint32_t gpu_count, double2 *dest, T *src,
+ void batch_fft_ggsw_vector(CudaStreams streams, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t max_shared_memory) {
- PANIC_IF_FALSE(streams.count() == 1,
+ PANIC_IF_FALSE(streams.count() == 1,
"GPU error (batch_fft_ggsw_vector): multi-GPU execution on %d "
"gpus is not supported yet.",
- gpu_count);
+ streams.count());
- cuda_set_device(gpu_indexes[0]);
+ cuda_set_device(streams.gpu_index(0));
int shared_memory_size = sizeof(double) * polynomial_size;
@@ -68,11 +67,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
- <<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
+ <<<gridSize, blockSize, 0, streams.stream(0)>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
- <<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
-                                                           d_mem);
+ <<<gridSize, blockSize, shared_memory_size, streams.stream(0)>>>(
+     dest, src, d_mem);
}
check_cuda_error(cudaGetLastError());
}
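
The NOSM/FULLSM branch above is a recurring launch pattern in this backend: use the shared-memory kernel variant only when the device can afford the dynamic bytes, otherwise fall back to a global-memory variant. A self-contained sketch of the pattern, with placeholder kernels standing in for the real ones:

#include <cstdint>
#include <cuda_runtime.h>

template <typename T> __global__ void kernel_no_shared_mem(T *data) {
  // NOSM variant: operates entirely out of global memory
}

template <typename T> __global__ void kernel_full_shared_mem(T *data) {
  extern __shared__ char smem[]; // dynamic shared memory, sized at launch
  (void)smem;
}

template <typename T>
void launch_with_sm_fallback(cudaStream_t stream, T *data,
                             uint32_t polynomial_size,
                             uint32_t max_shared_memory, dim3 grid,
                             dim3 block) {
  size_t shared_bytes = sizeof(double) * polynomial_size;
  if (max_shared_memory < shared_bytes) {
    // Device cannot hold the working set in shared memory: NOSM variant.
    kernel_no_shared_mem<T><<<grid, block, 0, stream>>>(data);
  } else {
    // FULLSM variant: request the dynamic shared bytes at launch time.
    kernel_full_shared_mem<T><<<grid, block, shared_bytes, stream>>>(data);
  }
}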

View File

@@ -142,8 +142,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
}
template <typename Torus>
- void execute_keyswitch_async(cudaStream_t const *streams,
-                              uint32_t const *gpu_indexes, uint32_t gpu_count,
+ void execute_keyswitch_async(CudaStreams streams,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const LweArrayVariant<Torus> &lwe_array_in,
@@ -154,8 +153,9 @@ void execute_keyswitch_async(cudaStream_t const *streams,
/// If the number of radix blocks is lower than the number of GPUs, not all
/// GPUs will be active and there will be 1 input per GPU
- for (uint i = 0; i < gpu_count; i++) {
-   int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
+ for (uint i = 0; i < streams.count(); i++) {
+   int num_samples_on_gpu =
+       get_num_inputs_on_gpu(num_samples, i, streams.count());
Torus *current_lwe_array_out = get_variant_element(lwe_array_out, i);
Torus *current_lwe_output_indexes =
@@ -166,7 +166,7 @@ void execute_keyswitch_async(cudaStream_t const *streams,
// Compute Keyswitch
host_keyswitch_lwe_ciphertext_vector<Torus>(
- streams[i], gpu_indexes[i], current_lwe_array_out,
+ streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, current_lwe_array_in,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
base_log, level_count, num_samples_on_gpu);
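
The per-GPU split relies on `get_num_inputs_on_gpu`, whose definition is outside this diff. A plausible model, assuming it balances the remainder over the first GPUs (this is an inference from the "1 input per GPU" comment above, not the confirmed implementation):

#include <cstdint>

// Hypothetical equivalent of get_num_inputs_on_gpu: GPUs with index below
// (num_samples % gpu_count) take one extra input, so counts differ by at
// most one across the active GPUs.
inline int num_inputs_on_gpu(int num_samples, int gpu_i, int gpu_count) {
  int base = num_samples / gpu_count;
  int remainder = num_samples % gpu_count;
  return base + (gpu_i < remainder ? 1 : 0);
}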

View File

@@ -1,15 +1,42 @@
#include "device.h"
#include <atomic>
#include <cstdint>
#include <cuda_runtime.h>
#include <deque>
#include <unordered_map>
#include <mutex>
#ifdef USE_NVTOOLS
#include <cuda_profiler_api.h>
#endif
#ifdef CUDA_STREAM_POOL
#include <deque>
#include <vector>
#include <unordered_map>
#endif
#include <thread>
#define USE_MEMORY_MANAGER
// #define DEBUG_MEMORY_MANAGER
#define MAX_CACHE_SIZE (1 << 30)
#ifdef USE_MEMORY_MANAGER
#include <list>
#include <sstream>
#include <string>
#include <thread>
#endif
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
return static_cast<uint32_t>(device);
}
- std::mutex pool_mutex;
- bool mem_pools_enabled = false;
+ std::atomic<bool> mem_pools_enabled = false;
// We use memory pools to reduce some overhead of memory allocations due
// to our scratch/release pattern. This function is the simplest way of using
@@ -26,13 +53,13 @@ bool mem_pools_enabled = false;
// We tested more complex configurations of mempools, but they did not yield
// better results.
void cuda_setup_mempool(uint32_t caller_gpu_index) {
- if (!mem_pools_enabled) {
-   pool_mutex.lock();
-   if (mem_pools_enabled)
-     return; // If mem pools are already enabled, we don't need to do anything
-   // We do it only once for all GPUs
-   mem_pools_enabled = true;
+ bool pools_not_initialized = false;
+ bool pools_initialized = true;
+ // If mem_pools_enabled still holds pools_not_initialized, atomically swap in
+ // pools_initialized and run the if body; exactly one thread wins the
+ // exchange, so the pools are set up only once across all GPUs.
+ if (mem_pools_enabled.compare_exchange_strong(pools_not_initialized,
+                                               pools_initialized)) {
uint32_t num_gpus = cuda_get_number_of_gpus();
for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cuda_set_device(gpu_index);
@@ -75,7 +102,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
}
// We return to the original gpu_index
cuda_set_device(caller_gpu_index);
- pool_mutex.unlock();
}
}
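
Stripped of the mempool specifics, the synchronization idiom introduced here is a lock-free "run once" gate. A minimal standalone sketch of the same pattern:

#include <atomic>

std::atomic<bool> initialized{false};

void init_once(void (*do_init)()) {
  bool expected = false;
  // Only the thread that swaps false -> true executes the body; every other
  // thread, concurrent or later, observes true and skips it.
  if (initialized.compare_exchange_strong(expected, true)) {
    do_init();
  }
}

Note one behavioral difference from the mutex version it replaces: a thread that loses the race returns immediately rather than waiting for initialization to finish, which is acceptable here because each cuda_setup_mempool call is idempotent per GPU.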
@@ -83,8 +109,335 @@ void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
// Mempools are initialized only once in all the GPUS available
cuda_setup_mempool(gpu_index);
#ifdef USE_NVTOOLS
check_cuda_error(cudaProfilerStart());
#endif
}
#ifdef USE_MEMORY_MANAGER
enum CudaMemBlockUsageType { CUDA_ALLOC = 0, MEMSET, MEMCPY_SRC, MEMCPY_DEST, FREE };
enum CudaAllocType { SYNC = 0, ASYNC };
#ifdef DEBUG_MEMORY_MANAGER
struct CudaMemBlockUsage {
std::string location;
uint64_t timestamp;
CudaMemBlockUsageType type;
};
#endif
struct CudaMemBlock {
int8_t *ptr;
uint64_t size;
cudaStream_t stream;
uint32_t gpu_index;
size_t thread_id;
CudaAllocType alloc_type;
#ifdef DEBUG_MEMORY_MANAGER
std::vector<CudaMemBlockUsage> usages;
#endif
};
class CudaMemoryManager {
std::list<CudaMemBlock> cuda_allocs; // fresh allocs
std::list<CudaMemBlock> cuda_freed; // freed for good
std::unordered_map<cudaStream_t,
std::unordered_map<uint64_t, std::deque<CudaMemBlock>>>
cache; // freed and re-used
uint64_t cache_size = 0, peak_cache_size = 0;
std::mutex allocs_mutex;
#ifdef DEBUG_MEMORY_MANAGER
std::string make_location(const char *file, int line) {
std::stringstream sstr;
sstr << file << ":" << line;
return sstr.str();
}
uint64_t make_timestamp() {
const std::chrono::time_point<std::chrono::system_clock> now =
std::chrono::system_clock::now();
auto us = std::chrono::duration_cast<std::chrono::microseconds>(
now.time_since_epoch())
.count() %
1000000;
return us;
}
void check_range_is_valid(CudaMemBlockUsageType usage_type, int8_t *dest,
uint64_t size, cudaStream_t stream,
const char *file, int line) {
CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
usage_type};
const char *info = NULL;
switch (usage_type) {
case MEMSET:
info = "memset";
break;
case MEMCPY_SRC:
info = "memcpy source";
break;
case MEMCPY_DEST:
info = "memcpy dest";
break;
default:
info = "unknown";
}
auto device_id = cuda_get_device();
bool found = false;
for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
if (it->ptr == dest && it->gpu_index == device_id) {
printf("%s with size tracking: found ptr %p\n", info, dest);
if (size > it->size) {
PANIC("%s OF %lu bytes TOO BIG TO %p OF SIZE %ld\n", info, size, dest,
it->size);
}
it->usages.push_back(usage);
found = true;
} else {
if (dest > it->ptr && dest < it->ptr + it->size &&
it->gpu_index == device_id) {
printf("%s with size tracking: indirect ptr %p in buffer %p\n", info,
dest, it->ptr);
if (dest + size > it->ptr + it->size) {
auto remain_bytes = it->ptr + it->size - dest;
PANIC("%s OF %lu bytes TOO BIG TO %p WHICH HAS ROOM ONLY FOR %d\n",
info, size, dest, remain_bytes);
}
it->usages.push_back(usage);
found = true;
}
}
}
if (!found) {
PANIC("Cuda %s to %p of size %lu, unknown pointer", info, dest, size);
}
}
#endif
public:
void alloc(void **ptr, uint64_t size, CudaAllocType alloc_type,
uint32_t gpu_index, cudaStream_t stream, const char *file,
int line) {
std::lock_guard<std::mutex> guard(allocs_mutex);
auto cache_of_stream = cache.find(stream);
if (cache_of_stream != cache.end()) {
auto cache_of_size = cache_of_stream->second.find(size);
if (cache_of_size != cache_of_stream->second.end() &&
!cache_of_size->second.empty()) {
auto cached_alloc = cache_of_size->second.front();
cache_of_size->second.pop_front();
// move to active allocs
cuda_allocs.push_back(cached_alloc);
*ptr = cached_alloc.ptr;
if (cache_size < size) {
PANIC("INVALID CACHE USE!!");
}
cache_size -= size;
#ifdef DEBUG_MEMORY_MANAGER
printf("Cuda Allocation serviced from cache: %p of size %lu on gpu %d "
"in %s\n",
ptr, size, gpu_index, "");
#endif
return;
}
}
cuda_set_device(gpu_index);
if (alloc_type == SYNC) {
check_cuda_error(cudaMalloc(ptr, size));
} else if (alloc_type == ASYNC) {
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaMallocAsync(ptr, size, stream));
} else {
check_cuda_error(cudaMalloc(ptr, size));
}
#else
check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
} else {
PANIC("Invalid allocation mode");
}
if (*ptr == nullptr) {
if (size > 0) {
PANIC("Allocation failed for %lu bytes, allocator returned %p", size,
ptr);
}
return;
}
auto thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id());
CudaMemBlock block = {(int8_t *)*ptr, size, stream,
gpu_index, thread_id, alloc_type};
#ifdef DEBUG_MEMORY_MANAGER
CudaMemBlockUsage usage = {make_location(file, line), make_timestamp(),
CUDA_ALLOC};
block.usages.push_back(usage);
printf("Cuda Allocated %p of size %lu on gpu %d in %s\n", ptr, size,
gpu_index, usage.location.c_str());
#endif
cuda_allocs.push_back(block);
}
void memset(int8_t *dest, uint64_t size, cudaStream_t stream,
const char *file, int line) {
#ifdef DEBUG_MEMORY_MANAGER
std::lock_guard<std::mutex> guard(allocs_mutex);
check_range_is_valid(MEMSET, dest, size, stream, file, line);
#endif
}
void memcpy(int8_t *dest, int8_t *src, uint64_t size, cudaStream_t stream,
const char *file, int line) {
#ifdef DEBUG_MEMORY_MANAGER
std::lock_guard<std::mutex> guard(allocs_mutex);
check_range_is_valid(MEMCPY_SRC, src, size, stream, file, line);
check_range_is_valid(MEMCPY_DEST, dest, size, stream, file, line);
#endif
}
void free(void *ptr, CudaAllocType alloc_type, uint32_t gpu_index,
cudaStream_t stream, const char *file, int line) {
if (ptr == nullptr)
return;
std::lock_guard<std::mutex> guard(allocs_mutex);
bool found = false;
bool must_free = false;
for (auto it = cuda_allocs.begin(); it != cuda_allocs.end(); it++) {
if (it->ptr == ptr && it->gpu_index == gpu_index) {
found = true;
if (cache_size + it->size < (MAX_CACHE_SIZE)) {
cache[stream][it->size].push_back(*it);
cache_size += it->size;
if (peak_cache_size < cache_size) {
peak_cache_size = cache_size;
}
} else {
cuda_freed.push_back(*it);
must_free = true;
}
#ifdef DEBUG_MEMORY_MANAGER
printf("cuda dropped buffer %p of size %lu on gpu %d\n", ptr, it->size,
gpu_index);
#endif
cuda_allocs.erase(it);
break;
}
}
if (must_free) {
cuda_set_device(gpu_index);
if (alloc_type == SYNC) {
check_cuda_error(cudaFree(ptr));
} else if (alloc_type == ASYNC) {
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaFreeAsync(ptr, stream));
} else {
check_cuda_error(cudaFree(ptr));
}
#else
check_cuda_error(cudaFree(ptr));
#endif
}
}
#ifdef DEBUG_MEMORY_MANAGER
if (!found) {
for (auto it = cuda_freed.begin(); it != cuda_freed.end(); it++) {
if (it->ptr == ptr && it->gpu_index == gpu_index) {
found = true;
printf("Drop in %s: %d\n", file, line);
printf("Alloc in %s\n", it->usages[0].location.c_str());
PANIC("cuda drop already dropped buffer %p of size %lu on gpu %d\n",
ptr, it->size, gpu_index);
}
}
}
if (!found) {
PANIC("cuda drop unknown buffer %p\n", ptr);
}
#endif
}
~CudaMemoryManager() {
#ifdef DEBUG_MEMORY_MANAGER
printf("%lu ALLOCATIONS AT PROGRAM EXIT\n", cuda_allocs.size());
for (auto &cuda_alloc : cuda_allocs) {
printf("%p of size %lu allocated at %s\n", cuda_alloc.ptr,
cuda_alloc.size, cuda_alloc.usages[0].location.c_str());
}
printf("\n\n\n %llu PEAK CACHE SIZE\n", peak_cache_size);
for (auto &cache_for_size : cache) {
for (auto &cuda_alloc : cache_for_size.second) {
printf("%p of size %lu cached at %s\n", cuda_alloc.ptr, cuda_alloc.size,
cuda_alloc.usages[0].location.c_str());
}
}
#endif
}
};
class CudaMultiGPUMemoryManager {
std::unordered_map<uint32_t, CudaMemoryManager> gMemManagers;
std::mutex gMemManagersMutex; // for creation of the mem managers
std::atomic<uint32_t> gMemManagerExists = 0;
public:
CudaMemoryManager &get(uint32_t gpu_index) {
if (gMemManagerExists.load() & (1 << gpu_index)) {
return gMemManagers[gpu_index];
} else {
std::lock_guard<std::mutex> guard(gMemManagersMutex);
uint32_t exist_flags = gMemManagerExists.load();
if (!(exist_flags & (1 << gpu_index))) {
gMemManagers[gpu_index]; // create it
gMemManagerExists.store(exist_flags | (1 << gpu_index));
}
return gMemManagers[gpu_index];
}
}
};
CudaMultiGPUMemoryManager gCudaMemoryManager;
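
Illustrative use of the caching behaviour defined above: a free parks the block in a per-(stream, size) cache instead of calling cudaFree, and the next alloc of the same size on the same stream is served from that cache. The gpu/stream values here are placeholders; `__FILE__`/`__LINE__` feed the debug bookkeeping.

void example_roundtrip(cudaStream_t stream, uint32_t gpu_index) {
  void *a = nullptr;
  gCudaMemoryManager.get(gpu_index).alloc(&a, 1 << 20, ASYNC, gpu_index,
                                          stream, __FILE__, __LINE__);
  // Parks the 1 MiB block in cache[stream][1 << 20] rather than freeing it.
  gCudaMemoryManager.get(gpu_index).free(a, ASYNC, gpu_index, stream,
                                         __FILE__, __LINE__);
  void *b = nullptr; // typically the same pointer as `a`, served from cache
  gCudaMemoryManager.get(gpu_index).alloc(&b, 1 << 20, ASYNC, gpu_index,
                                          stream, __FILE__, __LINE__);
}

Keying the cache on the stream as well as the size keeps reuse ordered with respect to the work already enqueued on that stream, which is what makes returning a block without a device synchronization safe.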
#endif
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
cuda_set_device(gpu_index);
cudaEvent_t event;
@@ -109,18 +462,90 @@ void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
check_cuda_error(cudaEventDestroy(event));
}
#ifdef CUDA_STREAM_POOL
struct CudaBoundStream
{
cudaStream_t stream;
uint32_t gpu_index;
};
class CudaStreamPool
{
std::vector<CudaBoundStream> poolCompute;
std::vector<CudaBoundStream> poolTransfer;
std::mutex mutex_pools;
size_t nextStream = 0;
const size_t MAX_STREAMS = 8;
public:
cudaStream_t create_stream(uint32_t gpu_index)
{
std::lock_guard<std::mutex> lock(mutex_pools);
if (poolCompute.empty())
{
poolCompute.reserve(MAX_STREAMS);
cuda_set_device(gpu_index);
for (size_t i = 0; i < MAX_STREAMS; i++)
{
cudaStream_t stream;
check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
poolCompute.push_back(CudaBoundStream{stream, gpu_index});
}
}
PANIC_IF_FALSE(gpu_index == poolCompute[nextStream].gpu_index, "Bad gpu in stream pool");
cudaStream_t res = poolCompute[nextStream].stream;
nextStream = (nextStream + 1) % poolCompute.size();
return res;
}
void destroy_stream(cudaStream_t stream, uint32_t gpu_index)
{
//do nothing
}
};
class CudaMultiStreamPool {
std::unordered_map<uint32_t, CudaStreamPool> per_gpu_pools;
std::mutex pools_mutex; // guards creation of the per-GPU stream pools
public:
CudaStreamPool &get(uint32_t gpu_index) {
std::lock_guard<std::mutex> guard(pools_mutex);
return per_gpu_pools[gpu_index]; // creates it if it does not exist
}
};
CudaMultiStreamPool gCudaStreamPool;
#endif
/// Unsafe function to create a CUDA stream, must check first that GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
#ifdef CUDA_STREAM_POOL
cuda_set_device(gpu_index); // this will initialize the mempool
return gCudaStreamPool.get(gpu_index).create_stream(gpu_index);
#else
cuda_set_device(gpu_index);
cudaStream_t stream;
check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
return stream;
#endif
}
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
#ifdef CUDA_STREAM_POOL
gCudaStreamPool.get(gpu_index).destroy_stream(stream, gpu_index);
#else
cuda_set_device(gpu_index);
check_cuda_error(cudaStreamDestroy(stream));
#endif
}
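
With CUDA_STREAM_POOL enabled, the functions above change meaning: "creating" a stream borrows one of the MAX_STREAMS pooled streams in round-robin order, and "destroying" it is a no-op. A short usage note as code (placeholder function; the consequence for callers is the important part):

void example_pool_usage(uint32_t gpu_index) {
  cudaStream_t s0 = cuda_create_stream(gpu_index); // borrows a pooled stream
  cuda_destroy_stream(s0, gpu_index); // no-op: the pool keeps the stream alive
  // After MAX_STREAMS further cuda_create_stream calls on this GPU, the
  // round-robin cursor wraps and the same handle s0 is handed out again, so
  // callers must never cudaStreamDestroy a pooled handle themselves.
}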
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
@@ -128,13 +553,6 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaStreamSynchronize(stream));
}
- void synchronize_streams(cudaStream_t const *streams,
-                          uint32_t const *gpu_indexes, uint32_t gpu_count) {
-   for (uint i = 0; i < gpu_count; i++) {
-     cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-   }
- }
// Determine if a CUDA device is available at runtime
uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
@@ -142,24 +560,38 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
void *ptr = nullptr;
#ifdef USE_MEMORY_MANAGER
gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, SYNC, gpu_index, 0,
"rust_code", 0);
#else
cuda_set_device(gpu_index);
- void *ptr;
check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
return ptr;
}
void *cuda_ext_malloc(uint64_t size, uint32_t gpu_index) {
return cuda_malloc(size, gpu_index);
}
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
- void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                            uint32_t gpu_index,
-                                            uint64_t &size_tracker,
-                                            bool allocate_gpu_memory) {
+ void *cuda_intern_malloc_with_size_tracking_async(uint64_t size,
+                                                   cudaStream_t stream,
+                                                   uint32_t gpu_index,
+                                                   uint64_t &size_tracker,
+                                                   bool allocate_gpu_memory,
+                                                   const char *file, int line) {
size_tracker += size;
void *ptr = nullptr;
if (!allocate_gpu_memory)
return ptr;
#ifdef USE_MEMORY_MANAGER
gCudaMemoryManager.get(gpu_index).alloc(&ptr, size, ASYNC, gpu_index, stream,
file, line);
#else
cuda_set_device(gpu_index);
#ifndef CUDART_VERSION
@@ -177,16 +609,23 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
#else
check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
#endif
return ptr;
}
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
- void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
-                         uint32_t gpu_index) {
+ void *cuda_int_malloc_async(uint64_t size, cudaStream_t stream,
+                             uint32_t gpu_index, const char *file, int line) {
  uint64_t size_tracker = 0;
- return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                             size_tracker, true);
+ return cuda_intern_malloc_with_size_tracking_async(
+     size, stream, gpu_index, size_tracker, true, file, line);
}
void *cuda_ext_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
return cuda_malloc_async(size, stream, gpu_index);
}
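
The size_tracker / allocate_gpu_memory pair supports a dry-run mode that the scratch_* functions rely on: called with allocate_gpu_memory = false, nothing touches the device and only the tracker grows, so a whole scratch structure can be sized before committing memory. A minimal sketch against the function defined above:

uint64_t plan_buffer_bytes(cudaStream_t stream, uint32_t gpu_index,
                           uint64_t size) {
  uint64_t size_tracker = 0;
  // Dry run: the allocator returns nullptr and only accumulates the bytes.
  void *probe = cuda_intern_malloc_with_size_tracking_async(
      size, stream, gpu_index, size_tracker, false, __FILE__, __LINE__);
  (void)probe; // nullptr in dry-run mode
  return size_tracker;
}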
/// Check that allocation is valid
@@ -261,6 +700,11 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
gpu_index, true);
}
void cuda_ext_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
}
/// Copy memory within a GPU asynchronously
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
void *dest, void const *src, uint64_t size, cudaStream_t stream,
@@ -293,6 +737,12 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
gpu_index, true);
}
void cuda_ext_memcpy_async_gpu_to_gpu(void *dest, void const *src,
uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
cuda_memcpy_async_gpu_to_gpu(dest, src, size, stream, gpu_index);
}
/// Copy memory within a GPU
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
uint32_t gpu_index) {
@@ -317,6 +767,11 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
}
}
void cuda_ext_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
uint32_t gpu_index) {
cuda_memcpy_gpu_to_gpu(dest, src, size, gpu_index);
}
/// Synchronizes device
void cuda_synchronize_device(uint32_t gpu_index) {
cuda_set_device(gpu_index);
@@ -329,6 +784,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
@@ -336,6 +792,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
}
cuda_set_device(gpu_index);
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
gCudaMemoryManager.get(gpu_index).memset((int8_t *)dest, size, stream, "", 0);
}
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
@@ -344,6 +801,11 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
true);
}
void cuda_ext_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_memset_async(dest, val, size, stream, gpu_index);
}
template <typename Torus>
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
@@ -395,6 +857,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
}
void cuda_ext_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}
/// Return number of GPUs available
int cuda_get_number_of_gpus() {
int num_gpus;
@@ -410,19 +877,31 @@ int cuda_get_number_of_sms() {
}
/// Drop a cuda array
- void cuda_drop(void *ptr, uint32_t gpu_index) {
+ void cuda_int_drop(void *ptr, uint32_t gpu_index, const char *file, int line) {
#ifdef USE_MEMORY_MANAGER
gCudaMemoryManager.get(gpu_index).free(ptr, SYNC, gpu_index, 0, file, line);
#else
cuda_set_device(gpu_index);
check_cuda_error(cudaFree(ptr));
#endif
}
void cuda_ext_drop(void *ptr, uint32_t gpu_index) { cuda_drop(ptr, gpu_index); }
/// Drop a cuda array asynchronously, if the data was allocated & it's supported
/// on the device
- void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                         uint32_t gpu_index,
-                                         bool gpu_memory_allocated) {
+ void cuda_int_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                             uint32_t gpu_index,
+                                             bool gpu_memory_allocated,
+                                             const char *file, int line) {
if (!gpu_memory_allocated)
return;
#ifdef USE_MEMORY_MANAGER
gCudaMemoryManager.get(gpu_index).free(ptr, ASYNC, gpu_index, stream, file,
line);
#else
cuda_set_device(gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
@@ -439,11 +918,14 @@ void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
#else
check_cuda_error(cudaFree(ptr));
#endif
#endif
}
/// Drop a cuda array asynchronously, if supported on the device
- void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
-   cuda_drop_with_size_tracking_async(ptr, stream, gpu_index, true);
+ void cuda_int_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index,
+                          const char *file, int line) {
+   cuda_int_drop_with_size_tracking_async(ptr, stream, gpu_index, true, file,
+                                          line);
}
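
The cuda_int_* renames thread a (file, line) pair into the memory manager while code such as cuda_ext_drop still calls the old names. A plausible way the public names stay source-compatible is a macro layer in the header that injects the call site; this is an assumption, the actual header is not part of this diff:

/* Hypothetical header-side shims (assumed, not shown in this compare view) */
#define cuda_drop(ptr, gpu_index) \
  cuda_int_drop((ptr), (gpu_index), __FILE__, __LINE__)
#define cuda_malloc_async(size, stream, gpu_index) \
  cuda_int_malloc_async((size), (stream), (gpu_index), __FILE__, __LINE__)
#define cuda_drop_async(ptr, stream, gpu_index) \
  cuda_int_drop_async((ptr), (stream), (gpu_index), __FILE__, __LINE__)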
/// Get the maximum size for the shared memory per streaming multiprocessors

View File

@@ -1,13 +1,12 @@
#include "integer/abs.cuh"
uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
- uint32_t polynomial_size, uint32_t big_lwe_dimension,
- uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
- uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
- uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
- PBS_TYPE pbs_type, bool allocate_gpu_memory,
+ CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
+ uint32_t glwe_dimension, uint32_t polynomial_size,
+ uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+ uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+ uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+ uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -16,31 +15,27 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_abs_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count,
- (int_abs_buffer<uint64_t> **)mem_ptr, is_signed, num_blocks, params,
- allocate_gpu_memory);
+ CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
+ num_blocks, params, allocate_gpu_memory);
}
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
- void *const *bsks, void *const *ksks,
+ CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
+ bool is_signed, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
- host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
-                               gpu_count, ct, bsks, (uint64_t **)(ksks),
-                               ms_noise_reduction_key, mem, is_signed);
+ host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
+                               (uint64_t **)(ksks), ms_noise_reduction_key,
+                               mem, is_signed);
}
- void cleanup_cuda_integer_abs_inplace(void *const *streams,
-                                       uint32_t const *gpu_indexes,
-                                       uint32_t gpu_count,
+ void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_abs_buffer<uint64_t> *mem_ptr =
(int_abs_buffer<uint64_t> *)(*mem_ptr_void);
- mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+ mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
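
These three entry points follow the backend's scratch/run/cleanup lifecycle: scratch_* sizes and allocates the operation's buffer, the operation can then run any number of times against it, and cleanup_* releases it and nulls the handle. A sketch of a caller, with stream/key/ciphertext setup elided and all numeric parameter values as illustrative placeholders only:

void abs_inplace_lifecycle(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t num_blocks, PBS_TYPE pbs_type,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  int8_t *mem = nullptr;
  // Phase 1: allocate the scratch buffer (placeholder parameter set).
  scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
      streams, &mem, /*is_signed=*/true, /*glwe_dimension=*/1,
      /*polynomial_size=*/2048, /*big_lwe_dimension=*/2048,
      /*small_lwe_dimension=*/742, /*ks_level=*/5, /*ks_base_log=*/3,
      /*pbs_level=*/1, /*pbs_base_log=*/23, /*grouping_factor=*/0, num_blocks,
      /*message_modulus=*/4, /*carry_modulus=*/4, pbs_type,
      /*allocate_gpu_memory=*/true, noise_reduction_type);
  // Phase 2: run the operation (may be repeated with the same scratch).
  cuda_integer_abs_inplace_radix_ciphertext_kb_64(
      streams, ct, mem, /*is_signed=*/true, bsks, ksks,
      ms_noise_reduction_key);
  // Phase 3: release the scratch; cleanup nulls the handle.
  cleanup_cuda_integer_abs_inplace(streams, &mem);
}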

View File

@@ -18,14 +18,12 @@
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_abs_kb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
+ CudaStreams streams, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
if (is_signed) {
- *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
-                                      params, num_blocks,
+ *mem_ptr = new int_abs_buffer<Torus>(streams, params, num_blocks,
allocate_gpu_memory, size_tracker);
}
return size_tracker;
@@ -33,8 +31,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
template <typename Torus>
__host__ void host_integer_abs_kb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, CudaRadixCiphertextFFI *ct, void *const *bsks,
+ CudaStreams streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
@@ -47,24 +44,24 @@ __host__ void host_integer_abs_kb(
(31 - __builtin_clz(mem_ptr->params.message_modulus)) *
ct->num_radix_blocks;
- copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mask, ct);
+ copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                    mask, ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
- streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
+ streams, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
- host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
+ host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
ct->num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
- streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
- bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);
+ streams, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
+ ms_noise_reduction_key, requested_flag, uses_carry);
- host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
-                                    ct, mem_ptr->bitxor_mem, bsks, ksks,
-                                    ms_noise_reduction_key);
+ host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
+                                    bsks, ksks, ms_noise_reduction_key);
}
#endif // TFHE_RS_ABS_CUH
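
The sequence in host_integer_abs_kb (arithmetic shift to build a sign mask, addition with carry propagation, then XOR against the mask) is the classic branch-free absolute value: with m = x >> (n - 1), which is all ones when x is negative and all zeros otherwise, abs(x) = (x + m) ^ m. A scalar illustration of the same identity:

#include <cassert>
#include <cstdint>

int64_t abs_branchless(int64_t x) {
  int64_t m = x >> 63; // arithmetic shift: -1 if x < 0, 0 otherwise
  return (x + m) ^ m;  // two's-complement negation when m == -1, identity when m == 0
}

int main() {
  assert(abs_branchless(-5) == 5); // (-5 + -1) ^ -1 == -6 ^ -1 == 5
  assert(abs_branchless(7) == 7);  // (7 + 0) ^ 0 == 7
  return 0;
}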

View File

@@ -1,14 +1,13 @@
#include "integer/bitwise_ops.cuh"
uint64_t scratch_cuda_integer_radix_bitop_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
- uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
- uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
- uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
- BITOP_TYPE op_type, bool allocate_gpu_memory,
- PBS_MS_REDUCTION_T noise_reduction_type) {
+ CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+ uint32_t polynomial_size, uint32_t big_lwe_dimension,
+ uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+ uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+ uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+ uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
+ bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -16,32 +15,28 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count,
- (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
- op_type, allocate_gpu_memory);
+ CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
+ lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
}
void cuda_bitop_integer_radix_ciphertext_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- CudaRadixCiphertextFFI *lwe_array_out,
+ CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_integer_radix_bitop_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
- lwe_array_1, lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
- (uint64_t **)(ksks), ms_noise_reduction_key);
+ CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
+ (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+ ms_noise_reduction_key);
}
- void cleanup_cuda_integer_bitop(void *const *streams,
-                                 uint32_t const *gpu_indexes, uint32_t gpu_count,
-                                 int8_t **mem_ptr_void) {
+ void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
- mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+ mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -13,8 +13,7 @@
template <typename Torus>
__host__ void host_integer_radix_bitop_kb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+ CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
@@ -46,8 +45,8 @@ __host__ void host_integer_radix_bitop_kb(
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
- streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
- bsks, ksks, ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
+ streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
+ ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
lut->params.message_modulus);
memcpy(lwe_array_out->degrees, degrees,
@@ -56,14 +55,12 @@ __host__ void host_integer_radix_bitop_kb(
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, int_bitop_buffer<Torus> **mem_ptr,
+ CudaStreams streams, int_bitop_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
- *mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
-                                        params, num_radix_blocks,
+ *mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
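
The bivariate lookup used by these bitops packs a pair of blocks into one PBS input: the left operand is scaled by message_modulus and the right operand added, so a single programmable bootstrap can evaluate f(left, right). A cleartext model of the packing and unpacking (the LUT internally plays the role of the divide/modulo here):

#include <cstdint>

uint64_t apply_bivariate(uint64_t left, uint64_t right,
                         uint64_t message_modulus,
                         uint64_t (*f)(uint64_t, uint64_t)) {
  uint64_t packed = left * message_modulus + right; // what the PBS input holds
  uint64_t l = packed / message_modulus;            // what the LUT recovers
  uint64_t r = packed % message_modulus;
  return f(l, r); // e.g. AND/OR/XOR on the block values
}

This is why the carry space must be at least as large as the message space: the scaled left operand has to fit above the right operand without overlap.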

View File

@@ -2,28 +2,26 @@
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
- void *const *streams, uint32_t const *gpu_indexes) {
+ CudaStreamsFFI streams) {
  host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
-     output, input, (cudaStream_t *)streams, gpu_indexes);
+     output, input, CudaStreams(streams));
}
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
- void *const *streams,
- uint32_t const *gpu_indexes) {
+ CudaStreamsFFI streams) {
- host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
-                                      gpu_indexes);
+ host_trim_radix_blocks_lsb<uint64_t>(output, input, CudaStreams(streams));
}
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
- uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
- uint32_t num_blocks, uint32_t num_additional_blocks,
- uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
- bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+ CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+ uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+ uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+ uint32_t grouping_factor, uint32_t num_blocks,
+ uint32_t num_additional_blocks, uint32_t message_modulus,
+ uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+ PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -32,34 +30,31 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
noise_reduction_type);
return scratch_extend_radix_with_sign_msb<uint64_t>(
- (cudaStream_t *)streams, gpu_indexes, gpu_count,
+ CudaStreams(streams),
(int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
num_blocks, num_additional_blocks, allocate_gpu_memory);
}
void cuda_extend_radix_with_sign_msb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
- int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
- void *const *ksks,
+ CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
+ CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
+ uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
PUSH_RANGE("cast")
host_extend_radix_with_sign_msb<uint64_t>(
- (cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
+ CudaStreams(streams), output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
POP_RANGE()
}
- void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
-                                                 uint32_t const *gpu_indexes,
-                                                 uint32_t gpu_count,
+ void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("clean cast")
int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
- mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+ mem_ptr->release(CudaStreams(streams));
POP_RANGE()
delete mem_ptr;
*mem_ptr_void = nullptr;

View File

@@ -8,19 +8,18 @@
template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
- cudaStream_t const *streams, uint32_t const *gpu_indexes) {
+ CudaStreams streams) {
PUSH_RANGE("extend only")
- copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                          0, input->num_radix_blocks, input, 0,
-                                          input->num_radix_blocks);
+ copy_radix_ciphertext_slice_async<Torus>(
+     streams.stream(0), streams.gpu_index(0), output, 0,
+     input->num_radix_blocks, input, 0, input->num_radix_blocks);
POP_RANGE()
}
template <typename Torus>
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
- cudaStream_t const *streams,
- uint32_t const *gpu_indexes) {
+ CudaStreams streams) {
const uint32_t input_start_lwe_index =
input->num_radix_blocks - output->num_radix_blocks;
@@ -31,30 +30,29 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
input->num_radix_blocks, output->num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
- streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
- input_start_lwe_index, input->num_radix_blocks);
+ streams.stream(0), streams.gpu_index(0), output, 0,
+ output->num_radix_blocks, input, input_start_lwe_index,
+ input->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
+ CudaStreams streams, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
const int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
PUSH_RANGE("scratch cast/extend")
uint64_t size_tracker = 0;
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
- streams, gpu_indexes, gpu_count, params, num_radix_blocks,
- num_additional_blocks, allocate_gpu_memory, size_tracker);
+ streams, params, num_radix_blocks, num_additional_blocks,
+ allocate_gpu_memory, size_tracker);
POP_RANGE()
return size_tracker;
}
template <typename Torus>
__host__ void host_extend_radix_with_sign_msb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, CudaRadixCiphertextFFI *output,
+ CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
@@ -62,8 +60,8 @@ __host__ void host_extend_radix_with_sign_msb(
if (num_additional_blocks == 0) {
PUSH_RANGE("cast/extend no addblocks")
- copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
-                                    input);
+ copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
+                                    output, input);
POP_RANGE()
return;
}
@@ -72,24 +70,24 @@ __host__ void host_extend_radix_with_sign_msb(
PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");
- copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                          0, input_blocks, input, 0,
-                                          input_blocks);
+ copy_radix_ciphertext_slice_async<Torus>(
+     streams.stream(0), streams.gpu_index(0), output, 0, input_blocks, input,
+     0, input_blocks);
- copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                          mem_ptr->last_block, 0, 1, input,
-                                          input_blocks - 1, input_blocks);
+ copy_radix_ciphertext_slice_async<Torus>(
+     streams.stream(0), streams.gpu_index(0), mem_ptr->last_block, 0, 1, input,
+     input_blocks - 1, input_blocks);
- host_apply_univariate_lut_kb(
-     streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
-     mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
+ host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
+                              mem_ptr->last_block, mem_ptr->lut, ksks,
+                              ms_noise_reduction_key, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
- copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                          dst_block_idx, dst_block_idx + 1,
-                                          mem_ptr->padding_block, 0, 1);
+ copy_radix_ciphertext_slice_async<Torus>(
+     streams.stream(0), streams.gpu_index(0), output, dst_block_idx,
+     dst_block_idx + 1, mem_ptr->padding_block, 0, 1);
}
POP_RANGE()
}

View File

@@ -1,13 +1,13 @@
#include "integer/cmux.cuh"
uint64_t scratch_cuda_integer_radix_cmux_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
- uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
- uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
- uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
- bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+ CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+ uint32_t polynomial_size, uint32_t big_lwe_dimension,
+ uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+ uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+ uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+ uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+ PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch cmux")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,16 +18,14 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
[](uint64_t x) -> uint64_t { return x == 1; };
uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count,
- (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
- lwe_ciphertext_count, params, allocate_gpu_memory);
+ CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
+ predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
}
void cuda_cmux_integer_radix_ciphertext_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- CudaRadixCiphertextFFI *lwe_array_out,
+ CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
@@ -35,21 +33,18 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
PUSH_RANGE("cmux")
host_integer_radix_cmux_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
- lwe_condition, lwe_array_true, lwe_array_false,
- (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
- ms_noise_reduction_key);
+ CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
+ lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
+ (uint64_t **)(ksks), ms_noise_reduction_key);
POP_RANGE()
}
- void cleanup_cuda_integer_radix_cmux(void *const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count,
+ void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
- mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+ mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()

View File

@@ -6,8 +6,7 @@
template <typename Torus>
__host__ void
- zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-             uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+ zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
@@ -27,26 +26,25 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
"Cuda error: input and output radix ciphertexts must have the same "
"lwe dimension");
- cuda_set_device(gpu_indexes[0]);
+ cuda_set_device(streams.gpu_index(0));
auto params = mem_ptr->params;
// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
// second operand is not an array
auto tmp_lwe_array_input = mem_ptr->tmp;
host_pack_bivariate_blocks_with_single_block<Torus>(
- streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
- predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
- predicate->lwe_indexes_in, params.message_modulus, num_radix_blocks);
+ streams, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input,
+ lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
+ num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
- streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
- ksks, ms_noise_reduction_key, predicate, num_radix_blocks);
+ streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks,
+ ms_noise_reduction_key, predicate, num_radix_blocks);
}
template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+ CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false,
@@ -62,18 +60,19 @@ __host__ void host_integer_radix_cmux_kb(
auto params = mem_ptr->params;
Torus lwe_size = params.big_lwe_dimension + 1;
copy_radix_ciphertext_slice_async<Torus>(
- streams[0], gpu_indexes[0], mem_ptr->buffer_in, 0, num_radix_blocks,
- lwe_array_true, 0, num_radix_blocks);
+ streams.stream(0), streams.gpu_index(0), mem_ptr->buffer_in, 0,
+ num_radix_blocks, lwe_array_true, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
- streams[0], gpu_indexes[0], mem_ptr->buffer_in, num_radix_blocks,
- 2 * num_radix_blocks, lwe_array_false, 0, num_radix_blocks);
+ streams.stream(0), streams.gpu_index(0), mem_ptr->buffer_in,
+ num_radix_blocks, 2 * num_radix_blocks, lwe_array_false, 0,
+ num_radix_blocks);
for (uint i = 0; i < 2 * num_radix_blocks; i++) {
- copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                          mem_ptr->condition_array, i, i + 1,
-                                          lwe_condition, 0, 1);
+ copy_radix_ciphertext_slice_async<Torus>(
+     streams.stream(0), streams.gpu_index(0), mem_ptr->condition_array, i,
+     i + 1, lwe_condition, 0, 1);
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
- streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
+ streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);
@@ -87,25 +86,24 @@ __host__ void host_integer_radix_cmux_kb(
as_radix_ciphertext_slice<Torus>(&mem_false, mem_ptr->buffer_out,
num_radix_blocks, 2 * num_radix_blocks);
- host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
-                      &mem_false, num_radix_blocks, params.message_modulus,
-                      params.carry_modulus);
+ host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &mem_true,
+                      &mem_true, &mem_false, num_radix_blocks,
+                      params.message_modulus, params.carry_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
- streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
- ms_noise_reduction_key, mem_ptr->message_extract_lut, num_radix_blocks);
+ streams, lwe_array_out, &mem_true, bsks, ksks, ms_noise_reduction_key,
+ mem_ptr->message_extract_lut, num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t gpu_count, int_cmux_buffer<Torus> **mem_ptr,
+ CudaStreams streams, int_cmux_buffer<Torus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
- *mem_ptr = new int_cmux_buffer<Torus>(
-     streams, gpu_indexes, gpu_count, predicate_lut_f, params,
-     num_radix_blocks, allocate_gpu_memory, size_tracker);
+ *mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
+                                       num_radix_blocks, allocate_gpu_memory,
+                                       size_tracker);
return size_tracker;
}
#endif
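
host_integer_radix_cmux_kb selects between two radix inputs with one encrypted boolean: both candidates are masked against the (appropriately negated) condition and the survivors are summed, then a final LUT re-normalizes the message space. A cleartext model of the selection arithmetic, hedged to the arithmetic only (the homomorphic version does the masking with the bivariate predicate LUT rather than a literal multiply):

#include <cstdint>

uint64_t cmux(uint64_t c /* 0 or 1 */, uint64_t true_val, uint64_t false_val) {
  // true_val survives when c == 1, false_val survives when c == 0.
  return true_val * c + false_val * (1 - c);
}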

View File

@@ -1,14 +1,13 @@
#include "integer/comparison.cuh"
uint64_t scratch_cuda_integer_radix_comparison_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
- uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
- uint32_t grouping_factor, uint32_t num_radix_blocks,
- uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
- COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
- PBS_MS_REDUCTION_T noise_reduction_type) {
+ CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+ uint32_t polynomial_size, uint32_t big_lwe_dimension,
+ uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+ uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+ uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+ PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
+ bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch comparison")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -20,9 +19,8 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
case EQ:
case NE:
size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count,
- (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
- op_type, false, allocate_gpu_memory);
+ CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
+ num_radix_blocks, params, op_type, false, allocate_gpu_memory);
break;
case GT:
case GE:
@@ -31,9 +29,8 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
case MAX:
case MIN:
size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count,
- (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
- op_type, is_signed, allocate_gpu_memory);
+ CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
+ num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
break;
}
POP_RANGE()
@@ -41,8 +38,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
}
void cuda_comparison_integer_radix_ciphertext_kb_64(
- void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
- CudaRadixCiphertextFFI *lwe_array_out,
+ CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
@@ -60,9 +56,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
- (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
- lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
- ms_noise_reduction_key, num_radix_blocks);
+ CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
+ bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
break;
case GT:
case GE:
@@ -72,18 +67,17 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (comparisons): the number of radix blocks has to be "
"even.")
host_integer_radix_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_1, lwe_array_2, buffer, buffer->diff_buffer->operator_f, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
break;
case MAX:
case MIN:
if (num_radix_blocks % 2 != 0)
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
host_integer_radix_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
@@ -91,27 +85,25 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
   POP_RANGE()
 }
 
-void cleanup_cuda_integer_comparison(void *const *streams,
-                                     uint32_t const *gpu_indexes,
-                                     uint32_t gpu_count,
+void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void) {
   PUSH_RANGE("cleanup comparison")
   int_comparison_buffer<uint64_t> *mem_ptr =
       (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));
   delete mem_ptr;
   *mem_ptr_void = nullptr;
   POP_RANGE()
}
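The three cleanup entry points in this file (here and below) differ only in the buffer type they cast to. Building on the sketched CudaStreams types above, a hypothetical helper capturing the shared idiom — not part of this diff — would be:

#include <cstdint>

// Hypothetical generic cleanup mirroring the entry points in this file:
// recover the typed buffer from the opaque handle, release its device
// memory on the given streams, free the host object, and null the
// caller's pointer so a second cleanup fails visibly instead of
// double-freeing.
template <typename Buffer>
void cleanup_opaque_buffer(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
  auto *mem_ptr = reinterpret_cast<Buffer *>(*mem_ptr_void);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}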
 uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -119,14 +111,12 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
                           message_modulus, carry_modulus, noise_reduction_type);
 
   return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
-      false, allocate_gpu_memory);
+      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
+      num_radix_blocks, params, EQ, false, allocate_gpu_memory);
 }
 
 void cuda_integer_are_all_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
     CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -136,30 +126,28 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
       (int_comparison_buffer<uint64_t> *)mem_ptr;
   host_integer_are_all_comparisons_block_true_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
-      num_radix_blocks);
+      CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
+      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
 }
 
 void cleanup_cuda_integer_are_all_comparisons_block_true(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
+    CudaStreamsFFI streams, int8_t **mem_ptr_void) {
   int_comparison_buffer<uint64_t> *mem_ptr =
       (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));
   delete mem_ptr;
   *mem_ptr_void = nullptr;
}
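For orientation, the two block reductions wrapped here have simple plaintext counterparts: judging by their names, are_all_comparisons_block_true is a logical AND over 0/1 comparison blocks, and is_at_least_one_comparisons_block_true (below) is the matching OR. A plaintext reference sketch of that assumed semantics:

#include <cstdint>
#include <vector>

// Plaintext reference (an assumption based on the function names): each
// input block holds a 0/1 comparison result, and the output is a single
// 0/1 block.
uint64_t are_all_blocks_true_ref(const std::vector<uint64_t> &blocks) {
  for (uint64_t b : blocks)
    if (b != 1)
      return 0; // one false block falsifies the AND
  return 1;
}

uint64_t is_at_least_one_block_true_ref(const std::vector<uint64_t> &blocks) {
  for (uint64_t b : blocks)
    if (b == 1)
      return 1; // one true block satisfies the OR
  return 0;
}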
 uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -167,14 +155,12 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
                           message_modulus, carry_modulus, noise_reduction_type);
 
   return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
-      false, allocate_gpu_memory);
+      CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
+      num_radix_blocks, params, EQ, false, allocate_gpu_memory);
 }
 
 void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
     CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
     void *const *bsks, void *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
@@ -184,18 +170,16 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
       (int_comparison_buffer<uint64_t> *)mem_ptr;
   host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
-      num_radix_blocks);
+      CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
+      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
 }
 
 void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
+    CudaStreamsFFI streams, int8_t **mem_ptr_void) {
   int_comparison_buffer<uint64_t> *mem_ptr =
       (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+  mem_ptr->release(CudaStreams(streams));
   delete mem_ptr;
   *mem_ptr_void = nullptr;
}
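Each scratch function above returns the size_tracker accumulated while building its buffer and takes an allocate_gpu_memory flag. Read together with the memory-management commit in this changeset, this plausibly supports a two-phase pattern: a dry run that only counts bytes, then a real allocation if the footprint fits. A hypothetical caller-side sketch, where scratch_comparison stands in for any _64 scratch entry point with its crypto parameters already bound:

#include <cstdint>

// Stand-in for one of the scratch entry points above, with all crypto
// parameters bound elsewhere; hypothetical, for illustration only.
uint64_t scratch_comparison(CudaStreamsFFI streams, int8_t **mem,
                            bool allocate_gpu_memory);

// Dry run first (allocate_gpu_memory = false, so size_tracker only),
// then allocate for real if the reported footprint fits the budget.
uint64_t query_then_allocate(CudaStreamsFFI streams, int8_t **mem,
                             uint64_t budget_bytes) {
  uint64_t needed = scratch_comparison(streams, mem,
                                       /*allocate_gpu_memory=*/false);
  if (needed <= budget_bytes)
    scratch_comparison(streams, mem, /*allocate_gpu_memory=*/true);
  return needed;
}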

Some files were not shown because too many files have changed in this diff.