wip: measure latencies of a list of cts

WIP: fix gpu streams and use iter_batched
chore(bench): new heuristic to define elements for throughput
2026-04-28 03:01:21 -04:00 · 2025-01-28 11:19:27 +01:00 · 2025-01-22 10:56:08 +01:00 · 2025-01-20 15:21:05 +01:00 · 2025-01-20 10:59:12 +01:00 · 2025-01-20 10:00:49 +01:00
413 changed files with 20389 additions and 8181 deletions
--- a/.editorconfig
+++ b/.editorconfig
@@ -8,8 +8,14 @@ root = true
 end_of_line = lf
 insert_final_newline = true

-# 4 space indentation
-[*.rs]
+# 4 space indentation for rust and toml
+[*.{rs,toml}]
 charset = utf-8
 indent_style = space
 indent_size = 4
+
+# 2 space indentation for c (sources and headers), js and json
+[*.{js,json,c,h}]
+charset = utf-8
+indent_style = space
+indent_size = 2
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -5,6 +5,7 @@ self-hosted-runner:
    - 4090-desktop
    - large_windows_16_latest
    - large_ubuntu_16
+    - large_ubuntu_16-22.04
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -76,7 +76,7 @@ jobs:
        with:
          persist-credentials: 'false'
          repository: zama-ai/tfhe-backward-compat-data
-          path: tfhe/tfhe-backward-compat-data
+          path: tests/tfhe-backward-compat-data
          lfs: 'true'
          ref: ${{ steps.backward_compat_branch.outputs.branch }}

@@ -94,7 +94,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (backward-compat-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, backward-compat-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -11,26 +11,30 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  MSG_MINIMAL: event,action url,commit
+  BRANCH: ${{ github.head_ref || github.ref }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
+  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
+  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
+  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
+    paths:
+      - '.github/**'
+      - 'ci/**'
+  # General entry point for Zama's pull requests as well as contributions from forks.
  pull_request_target:
+    paths:
+      - '**'
+      - '!.github/**'
+      - '!ci/**'

 jobs:
-  check-user-permission:
-    if: github.event_name == 'pull_request_target'
-    uses: ./.github/workflows/check_triggering_actor.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  should-run:
    runs-on: ubuntu-latest
-    needs: check-user-permission
-    if: github.event_name != 'pull_request_target' ||
-      needs.check-user-permission.result == 'success'
    permissions:
      pull-requests: write
    outputs:
@@ -58,14 +62,15 @@ jobs:
      user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.user_docs_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
+      ci_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ci_any_changed }}
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-          ref: ${{ github.event.pull_request.head.sha }}
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          ref: ${{ env.REF }}

      - name: Check for file changes
        id: changed-files
@@ -114,11 +119,15 @@ jobs:
            user_docs:
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - README.md
+            ci:
+              - .github/**
+              - ci/**

      - name: Aggregate file changes
        id: aggregated-changes
+        # CI files are not included in this aggregator.
        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
@@ -133,11 +142,21 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

+  # Fail if the triggering actor is not part of the Zama organization.
+  # On a pull_request_target event where CI files have changed, this job is skipped, which in turn skips the jobs that depend on it.
+  check-user-permission:
+    needs: should-run
+    if: github.event_name != 'pull_request_target' ||
+      (github.event_name == 'pull_request_target' && needs.should-run.outputs.ci_file_changed == 'false')
+    uses: ./.github/workflows/check_triggering_actor.yml
+    secrets:
+      TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
  setup-instance:
    name: Setup instance (fast-tests)
-    if: github.event_name != 'pull_request' ||
-      needs.should-run.outputs.any_file_changed == 'true'
-    needs: should-run
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: [ should-run, check-user-permission ]
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
@@ -155,8 +174,6 @@ jobs:

  fast-tests:
    name: Fast CPU tests
-    if: github.event_name != 'pull_request' ||
-      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    needs: [ should-run, setup-instance ]
    concurrency:
      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
@@ -167,8 +184,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-          ref: ${{ github.event.pull_request.head.sha }}
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          ref: ${{ env.REF }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -272,11 +289,11 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (fast-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, fast-tests ]
    runs-on: ubuntu-latest
    steps:
@@ -296,4 +313,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -134,7 +134,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (unsigned-integer-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, unsigned-integer-tests]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -138,7 +138,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (signed-integer-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [setup-instance, signed-integer-tests]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -111,7 +111,7 @@ jobs:
            user_docs:
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - README.md

      - name: Aggregate file changes
@@ -244,7 +244,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cpu-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cpu-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -113,7 +113,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (wasm-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, wasm-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -93,7 +93,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}
@@ -121,7 +121,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (boolean-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, boolean-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -3,6 +3,9 @@ name: Core crypto benchmarks

 on:
  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 5 a.m. UTC.
+    - cron: '0 5 * * 6'

 env:
  CARGO_TERM_COLOR: always
@@ -81,7 +84,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -109,7 +112,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (core-crypto-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, core-crypto-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -97,7 +97,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_erc20
          path: ${{ env.RESULTS_FILENAME }}
@@ -118,7 +118,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (erc20-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, erc20-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -82,7 +82,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -157,7 +157,7 @@ jobs:
      

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -98,7 +98,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -131,7 +131,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -124,7 +124,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_erc20_${{ inputs.profile }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -157,7 +157,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_gpu_integer_common.yml
+++ b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -200,7 +200,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -233,7 +233,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -170,7 +170,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -191,7 +191,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (integer-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, integer-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -136,7 +136,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -157,7 +157,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (shortint-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, shortint-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -164,7 +164,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -185,7 +185,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (integer-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, signed-integer-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -84,7 +84,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -84,7 +84,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_ntt
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -121,7 +121,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_tfhe_zk_pok
          path: ${{ env.RESULTS_FILENAME }}
@@ -149,7 +149,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (tfhe-zk-pok-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, tfhe-zk-pok-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -166,7 +166,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -194,7 +194,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (wasm-client-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, wasm-client-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -177,7 +177,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
        with:
          name: ${{ github.sha }}_integer_zk
          path: ${{ env.RESULTS_FILENAME }}
@@ -205,7 +205,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (pke-zk-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, pke-zk-benchmarks ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -12,7 +12,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  cargo-builds:
+  cargo-builds-ntt:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -12,7 +12,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  cargo-tests:
+  cargo-tests-fft:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
@@ -38,7 +38,7 @@ jobs:
        run: |
          make test_fft_no_std

-  cargo-tests-nightly:
+  cargo-tests-fft-nightly:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
@@ -60,7 +60,7 @@ jobs:
        run: |
          make test_fft_no_std_nightly

-  cargo-tests-node-js:
+  cargo-tests-fft-node-js:
    runs-on: "ubuntu-latest"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -12,7 +12,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  cargo-tests:
+  cargo-tests-ntt:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -33,7 +33,7 @@ jobs:
      - name: Test no-std
        run: make test_ntt_no_std

-  cargo-tests-nightly:
+  cargo-tests-ntt-nightly:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
--- a/.github/workflows/check_triggering_actor.yml
+++ b/.github/workflows/check_triggering_actor.yml
@@ -13,7 +13,7 @@ jobs:
    steps:
      - name: Get User Permission
        id: check-access
-        uses: actions-cool/check-user-permission@956b2e73cdfe3bcb819bb7225e490cb3b18fd76e # v2.2.1
+        uses: actions-cool/check-user-permission@7b90a27f92f3961b368376107661682c441f6103 # v2.3.0
        with:
          require: write
          username: ${{ github.triggering_actor }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -27,7 +27,7 @@ jobs:
          make lint_workflow

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6ae615f6475d2ede5ad88bea6baa7a1d5e93ffaa # v3.0.19
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@c3a2b64f69b7a1542a68f44d9edbd9ec3fc1455e # v3.0.20
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -115,7 +115,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (code-coverage)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, code-coverage ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -69,7 +69,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (csprng-randomness-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, csprng-randomness-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -51,7 +51,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_fast_h100_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -149,7 +149,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -50,7 +50,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_fast_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -147,7 +147,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -51,7 +51,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/**_multi_gpu_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -152,7 +152,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-tests-multi-gpu)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -92,7 +92,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (gpu-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -104,7 +104,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-pcc)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-pcc ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -51,7 +51,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -132,7 +132,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-signed-classic-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -51,7 +51,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -132,7 +132,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -58,7 +58,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -150,7 +150,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-signed-integer-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -51,7 +51,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -132,7 +132,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-unsigned-classic-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -51,7 +51,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -132,7 +132,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -57,7 +57,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**.md'
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
@@ -146,7 +146,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cuda-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-unsigned-integer-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -72,7 +72,7 @@ jobs:

  teardown-instance:
    name: Teardown instance (cpu-tests)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cpu-tests ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -43,14 +43,14 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe
-      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
+      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
        with:
          name: crate
          path: target/package/*.crate
@@ -77,12 +77,9 @@ jobs:
    name: Publish Release
    needs: [package] # for comparing hashes
    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      id-token: write
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
@@ -110,7 +107,7 @@ jobs:
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: failure
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
@@ -155,7 +152,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -1,49 +0,0 @@
-name: Publish tfhe-csprng release
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true
-
-env:
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-
-jobs:
-  verify_tag:
-    uses: ./.github/workflows/verify_tagged_commit.yml
-    secrets:
-      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
-      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
-
-  publish_release:
-    name: Publish tfhe-csprng Release
-    needs: verify_tag
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Publish crate.io package
-        env:
-          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
-        run: |
-          cargo publish -p tfhe-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -1,4 +1,3 @@
-# Publish new release of tfhe-cuda-backend on crates.io.
 name: Publish CUDA release

 on:
@@ -8,10 +7,6 @@ on:
        description: "Dry-run"
        type: boolean
        default: true
-      push_to_crates:
-        description: "Push to crate"
-        type: boolean
-        default: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -45,10 +40,12 @@ jobs:
          backend: aws
          profile: gpu-build

-  publish-cuda-release:
-    name: Publish CUDA Release
+  package:
+    name: Package CUDA Release for provenance
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -61,7 +58,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
@@ -95,26 +92,75 @@ jobs:
            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-cuda-backend
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
+  publish-cuda-release:
+    name: Publish CUDA Release
+    needs: [setup-instance, package] # for comparing hashes
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    steps:
      - name: Publish crate.io package
-        if: ${{ inputs.push_to_crates }}
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
          cargo publish -p tfhe-cuda-backend --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}

+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (publish-release)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, publish-cuda-release ]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/make_release_tfhe_csprng.yml
+++ b/.github/workflows/make_release_tfhe_csprng.yml
@@ -0,0 +1,103 @@
+name: Publish tfhe-csprng release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  verify_tag:
+    uses: ./.github/workflows/verify_tagged_commit.yml
+    secrets:
+      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+
+  package:
+    runs-on: ubuntu-latest
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-csprng
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+        with:
+          name: crate-tfhe-csprng
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+
+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
+
+  publish_release:
+    name: Publish tfhe-csprng Release
+    needs: [verify_tag, package]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+      - name: Download artifact
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: crate-tfhe-csprng
+          path: target/package
+      - name: Publish crate.io package
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          cargo publish -p tfhe-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-csprng - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -19,15 +19,53 @@ jobs:
      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

+  package:
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-fft
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
+        with:
+          name: crate
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
  publish_release:
    name: Publish tfhe-fft Release
    runs-on: ubuntu-latest
-    needs: verify_tag
+    needs: [verify_tag, package] # for comparing hashes
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Publish crate.io package
        env:
@@ -36,10 +74,26 @@ jobs:
        run: |
          cargo publish -p tfhe-fft --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}

+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-fft crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -19,13 +19,50 @@ jobs:
      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

+  package:
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-ntt
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
+        with:
+          name: crate
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
  publish_release:
    name: Publish tfhe-ntt Release
    runs-on: ubuntu-latest
-    needs: verify_tag
+    needs: [verify_tag, package] # for comparing hashes
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

@@ -36,10 +73,26 @@ jobs:
        run: |
          cargo publish -p tfhe-ntt --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}

+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-ntt crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe_versionable.yml
+++ b/.github/workflows/make_release_tfhe_versionable.yml
@@ -18,35 +18,159 @@ jobs:
      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

-  publish_release:
+  package-derive:
+    runs-on: ubuntu-latest
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-versionable-derive
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+        with:
+          name: crate-tfhe-versionable-derive
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+  provenance-derive:
+    needs: [package-derive]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package-derive.outputs.hash }}
+
+  publish_release-derive:
    name: Publish tfhe-versionable Release
-    needs: verify_tag
+    needs: [verify_tag, package-derive] # for comparing hashes
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Publish proc-macro crate
+      - name: Download artifact
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: crate-tfhe-versionable-derive
+          path: target/package
+      - name: Publish crate.io package  # dry-run flag is computed inline: this step's env no longer defines DRY_RUN
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
+          cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }} ${{ inputs.dry_run && '--dry-run' || '' }}
+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package-derive.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-versionable-derive - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-versionable-derive release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

-      - name: Publish main crate
-        if: ${{ ! inputs.dry_run }}
+  package:
+    runs-on: ubuntu-latest
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout  # same pinned checkout revision as the other release jobs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-versionable
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+        with:
+          name: crate-tfhe-versionable
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+  provenance:
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
+  publish_release:
+    name: Publish tfhe-versionable Release
+    needs: [package] # for comparing hashes
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout  # same pinned checkout revision as the other release jobs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Download artifact
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: crate-tfhe-versionable
+          path: target/package
+      - name: Publish crate.io package  # dry-run flag is computed inline: no DRY_RUN is defined in this step's env
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: |
-          cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
+          cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }} ${{ inputs.dry_run && '--dry-run' || '' }}
+
+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-versionable - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -1,4 +1,3 @@
-# Publish new release of tfhe-zk-pok on crates.io.
 name: Publish tfhe-zk-pok release

 on:
@@ -13,6 +12,40 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

 jobs:
+  package:  # packages tfhe-zk-pok and exposes the base64-encoded sha256sum of the crate as the `hash` output
+    runs-on: ubuntu-latest
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-zk-pok
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+        with:
+          name: crate-zk-pok
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
  verify_tag:
    uses: ./.github/workflows/verify_tagged_commit.yml
    secrets:
@@ -21,26 +54,43 @@ jobs:

  publish_release:
    name: Publish tfhe-zk-pok Release
-    needs: verify_tag
+    needs: [verify_tag, package] # for comparing hashes
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
+      - name: Download artifact
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: crate-zk-pok
+          path: target/package
      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
          cargo publish -p tfhe-zk-pok --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
-
+      - name: Verify hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe-zk-pok crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -14,7 +14,7 @@ on:

 jobs:
  params-curves-security-check:
-    runs-on: large_ubuntu_16
+    runs-on: large_ubuntu_16-22.04
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
--- a/.gitignore
+++ b/.gitignore
@@ -33,4 +33,4 @@ node_modules/
 package-lock.json

 # Dir used for backward compatibility test data
-tfhe/tfhe-backward-compat-data/
+tests/tfhe-backward-compat-data/
--- a/.linelint.yml
+++ b/.linelint.yml
@@ -1,11 +1,15 @@
 ignore:
  - .git
  - target
+  - tfhe/build
+  - venv
+  - web-test-runner
  - tfhe/benchmarks_parameters
  - tfhe/web_wasm_parallel_tests/node_modules
  - tfhe/web_wasm_parallel_tests/dist
  - keys
  - coverage
+  - utils/tfhe-lints/ui/main.stderr

 rules:
  # checks if file ends in a newline character
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,24 +11,21 @@ members = [
    "backends/tfhe-cuda-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
+    "tests",
 ]

-exclude = [
-    "tfhe/backward_compatibility_tests",
-    "utils/cargo-tfhe-lints-inner",
-    "utils/cargo-tfhe-lints"
-]
+exclude = ["tests/backward_compatibility_tests", "utils/tfhe-lints"]
 [workspace.dependencies]
 aligned-vec = { version = "0.6", default-features = false }
 bytemuck = "1.14.3"
 dyn-stack = { version = "0.11", default-features = false }
-itertools = "0.13"
+itertools = "0.14"
 num-complex = "0.4"
-pulp = { version = "0.20.0", default-features = false }
+pulp = { version = "0.20", default-features = false }
 rand = "0.8"
 rayon = "1"
 serde = { version = "1.0", default-features = false }
-wasm-bindgen = ">=0.2.86,<0.2.94"
+wasm-bindgen = "0.2.100"

 [profile.bench]
 lto = "fat"
@@ -46,3 +43,6 @@ inherits = "dev"
 opt-level = 3
 lto = "off"
 debug-assertions = false
+
+[workspace.metadata.dylint]
+libraries = [{ path = "utils/tfhe-lints" }]
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ BENCH_OP_FLAVOR?=DEFAULT
 BENCH_TYPE?=latency
 NODE_VERSION=22.6
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=v0.4
+BACKWARD_COMPAT_DATA_BRANCH?=v0.5
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
@@ -117,7 +117,7 @@ install_wasm_bindgen_cli: install_rs_build_toolchain
 .PHONY: install_wasm_pack # Install wasm-pack to build JS packages
 install_wasm_pack: install_rs_build_toolchain
 	@wasm-pack --version | grep "$(WASM_PACK_VERSION)" > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@0.13.1 || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@$(WASM_PACK_VERSION) || \
 	( echo "Unable to install cargo wasm-pack, unknown error." && exit 1 )

 .PHONY: install_node # Install last version of NodeJS via nvm
@@ -151,10 +151,9 @@ install_tarpaulin: install_rs_build_toolchain
 	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-tarpaulin --locked || \
 	( echo "Unable to install cargo tarpaulin, unknown error." && exit 1 )

-.PHONY: install_tfhe_lints # Install custom tfhe-rs lints
-install_tfhe_lints:
-	(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
-	cd utils/cargo-tfhe-lints && cargo install --path .
+.PHONY: install_cargo_dylint # Install cargo-dylint and dylint-link, needed to run custom tfhe-rs lints
+install_cargo_dylint:
+	cargo install cargo-dylint dylint-link

 .PHONY: install_typos_checker # Install typos checker
 install_typos_checker: install_rs_build_toolchain
@@ -243,7 +242,8 @@ fmt_js: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt
+	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt && \
+	$(MAKE) -C tfhe/js_on_wasm_tests fmt

 .PHONY: fmt_gpu # Format rust and cuda code
 fmt_gpu: install_rs_check_toolchain
@@ -272,7 +272,8 @@ check_fmt_js: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
+	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt && \
+	$(MAKE) -C tfhe/js_on_wasm_tests check_fmt

 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
@@ -281,14 +282,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p $(TFHE_SPEC)

@@ -393,10 +394,10 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
@@ -416,10 +417,15 @@ clippy_versionable: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-versionable -- --no-deps -D warnings

+.PHONY: clippy_tfhe_lints # Run clippy lints on tfhe-lints
+clippy_tfhe_lints: install_cargo_dylint # the toolchain is selected with toolchain.toml
+	cd utils/tfhe-lints && \
+	cargo clippy --all-targets -- --no-deps -D warnings
+
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable
+clippy_versionable clippy_tfhe_lints

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -435,13 +441,13 @@ check_rust_bindings_did_not_change:
 	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
 	git diff --quiet HEAD -- backends/tfhe-cuda-backend/src/bindings.rs || \
 	( echo "Generated bindings have changed! Please run 'git add backends/tfhe-cuda-backend/src/bindings.rs' \
-	and commit the changes." && exit 1 ) 
+	and commit the changes." && exit 1 )


 .PHONY: tfhe_lints # Run custom tfhe-rs lints
-tfhe_lints: install_tfhe_lints
-	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
-		--features=boolean,shortint,integer,zk-pok -- -D warnings
+tfhe_lints: install_cargo_dylint
+	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
+		--features=boolean,shortint,integer,strings,zk-pok

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -515,11 +521,11 @@ build_web_js_api: install_rs_build_toolchain install_wasm_pack
 build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 	cd tfhe && \
 	rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
-	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
+	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory" rustup run $(RS_CHECK_TOOLCHAIN) \
 		wasm-pack build --release --target=web \
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
 		-Z build-std=panic_abort,std && \
-	find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
+	find pkg/snippets -type f -iname workerHelpers.js -exec sed -i "s|const pkg = await import('..\/..\/..');|const pkg = await import('..\/..\/..\/tfhe.js');|" {} \;
 	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json

 .PHONY: build_node_js_api # Build the js API targeting nodejs
@@ -806,7 +812,7 @@ test_integer_cov: install_rs_check_toolchain install_tarpaulin
 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok -p $(TFHE_SPEC) \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p $(TFHE_SPEC) \
 		-- high_level_api::

 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
@@ -824,7 +830,7 @@ test_strings: install_rs_build_toolchain
 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok,strings \
 		-p $(TFHE_SPEC) \
 		-- test_user_docs::

@@ -887,16 +893,21 @@ test_versionable: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--all-targets -p tfhe-versionable

+.PHONY: test_tfhe_lints # Run tests on tfhe-lints
+test_tfhe_lints: install_cargo_dylint
+	cd utils/tfhe-lints && \
+	cargo test
+
 # The backward compat data repo holds historical binary data but also rust code to generate and load them.
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
 .PHONY: test_backward_compatibility_ci
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
-		--features=shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
+		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
+		--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
-test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
+test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci

 .PHONY: backward_compat_branch # Prints the required backward compatibility branch
 backward_compat_branch:
@@ -1045,35 +1056,35 @@ bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -1081,7 +1092,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1089,7 +1100,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1097,7 +1108,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1105,14 +1116,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
-	--features=integer,internal-keycache,zk-pok,nightly-avx512 \
+	--features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \
 	-p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
@@ -1275,9 +1286,9 @@ write_params_to_file: install_rs_check_toolchain

 .PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
 clone_backward_compat_data:
-	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
+	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)

-tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
+tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data

 #
 # Real use case examples
@@ -1303,9 +1314,7 @@ sha256_bool: install_rs_check_toolchain

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
-clippy_all check_compile_tests
-# TFHE lints deactivated as it's incompatible with 1.83 - temporary
-# tfhe_lints
+clippy_all check_compile_tests test_tfhe_lints tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change
@@ -1315,7 +1324,7 @@ fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_t
 check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
-conformance: fix_newline fmt
+conformance: fix_newline fmt fmt_js

 #=============================== FFT Section ==================================
 .PHONY: doc_fft # Build rust doc for tfhe-fft
@@ -1387,7 +1396,7 @@ test_fft_nightly: install_rs_check_toolchain
 .PHONY: test_fft_no_std
 test_fft_no_std: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
-		--no-default-features 
+		--no-default-features
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
 		--no-default-features \
 		--features=fft128
@@ -1481,7 +1490,7 @@ test_ntt_nightly: install_rs_check_toolchain
 .PHONY: test_ntt_no_std
 test_ntt_no_std: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt \
-		--no-default-features 
+		--no-default-features

 .PHONY: test_ntt_no_std_nightly
 test_ntt_no_std_nightly: install_rs_check_toolchain
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -95,7 +95,7 @@ fn main() {
 			val >>= 1;
 		}
 	}
-	
+
 	let output_0_63    = "F4CD954A717F26A7D6930830C4E7CF0819F80E03F25F342C64ADC66ABA7F8A8E6EAA49F23632AE3CD41A7BD290A0132F81C6D4043B6E397D7388F3A03B5FE358".to_string();

 	let cipher_key = key.map(|x| FheBool::encrypt(x, &client_key));
@@ -129,24 +129,36 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `PARAM_MESSAGE_1_CARRY_1_KS_PBS`). It uses a lower level API 
-of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run on the same 
-cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes 
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64`).
+It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
+on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a key switching key (`KeySwitchingKey`), to switch from one parameter space to another, which makes
 its setup a little more intricate.

 Example code:
 ```rust
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::CastingKey;
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
+};
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
 use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
-	let config = ConfigBuilder::default().build();
+	let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
 	let (hl_client_key, hl_server_key) = generate_keys(config);
-	let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
-	let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));
+    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
+    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
+
+	let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+    let ksk = KeySwitchingKey::new(
+        (&client_key, Some(&server_key)),
+        (&underlying_ck, &underlying_sk),
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+    );

 	let key_string = "0053A6F94C9FF24598EB".to_string();
 	let mut key = [0; 80];
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,23 +1,28 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
+};
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
 use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -57,18 +62,20 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
 }

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -103,18 +110,20 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
 }

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,23 +1,28 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
+};
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
 use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -57,18 +62,20 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
 }

 pub fn trivium_shortint_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -103,18 +110,20 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
 }

 pub fn trivium_shortint_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,6 +1,9 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
+};
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
 // commit fd6828f68711276c25f55e605935028f5e843f43
@@ -216,18 +219,20 @@ use tfhe::shortint::prelude::*;

 #[test]
 fn kreyvium_test_shortint_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,6 +1,9 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
+};
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
 // file testvectors/trivium-80.80.test-vectors
@@ -352,18 +355,20 @@ use tfhe::shortint::prelude::*;

 #[test]
 fn trivium_test_shortint_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.6.0"
+version = "0.7.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -14,4 +14,4 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
 [build-dependencies]
 cmake = { version = "0.1" }
 pkg-config = { version = "0.3" }
-bindgen = "0.70.1"
+bindgen = "0.71"
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -65,7 +65,7 @@ template <typename Torus> struct int_decompression {
  Torus *tmp_extracted_lwe;
  uint32_t *tmp_indexes_array;

-  int_radix_lut<Torus> *carry_extract_lut;
+  int_radix_lut<Torus> *decompression_rescale_lut;

  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                    uint32_t gpu_count, int_radix_params encryption_params,
@@ -84,7 +84,7 @@ template <typename Torus> struct int_decompression {
      Torus lwe_accumulator_size = (compression_params.glwe_dimension *
                                        compression_params.polynomial_size +
                                    1);
-      carry_extract_lut = new int_radix_lut<Torus>(
+      decompression_rescale_lut = new int_radix_lut<Torus>(
          streams, gpu_indexes, gpu_count, encryption_params, 1,
          num_radix_blocks, allocate_gpu_memory);

@@ -97,18 +97,28 @@ template <typename Torus> struct int_decompression {
          num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
          gpu_indexes[0]);

-      // Carry extract LUT
-      auto carry_extract_f = [encryption_params](Torus x) -> Torus {
-        return x / encryption_params.message_modulus;
+      // Rescale is done using an identity LUT.
+      // Here we do not divide by message_modulus.
+      // Example: in the 2_2 case we are mapping a 2-bit message onto a 4-bit
+      // space; we want to keep the original 2-bit value in the 4-bit space,
+      // so we apply the identity and the encoding will rescale it for us.
+      auto decompression_rescale_f = [encryption_params](Torus x) -> Torus {
+        return x;
      };

-      generate_device_accumulator<Torus>(
-          streams[0], gpu_indexes[0], carry_extract_lut->get_lut(0, 0),
-          encryption_params.glwe_dimension, encryption_params.polynomial_size,
-          encryption_params.message_modulus, encryption_params.carry_modulus,
-          carry_extract_f);
+      auto effective_compression_message_modulus =
+          encryption_params.carry_modulus;
+      auto effective_compression_carry_modulus = 1;

-      carry_extract_lut->broadcast_lut(streams, gpu_indexes, 0);
+      generate_device_accumulator_with_encoding<Torus>(
+          streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
+          encryption_params.glwe_dimension, encryption_params.polynomial_size,
+          effective_compression_message_modulus,
+          effective_compression_carry_modulus,
+          encryption_params.message_modulus, encryption_params.carry_modulus,
+          decompression_rescale_f);
+
+      decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0);
    }
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -117,8 +127,8 @@ template <typename Torus> struct int_decompression {
    cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);

-    carry_extract_lut->release(streams, gpu_indexes, gpu_count);
-    delete carry_extract_lut;
+    decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
+    delete decompression_rescale_lut;
  }
 };
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -38,6 +38,15 @@ void generate_device_accumulator_bivariate_with_factor(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor);
+
+template <typename Torus>
+void generate_device_accumulator_with_encoding(
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_message_modulus, uint32_t input_carry_modulus,
+    uint32_t output_message_modulus, uint32_t output_carry_modulus,
+    std::function<Torus(Torus)> f);
+
 /*
 *  generate univariate accumulator (lut) for device pointer
 *    stream - cuda stream
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -5,45 +5,50 @@

 extern "C" {

-void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                          void *lwe_array_out,
-                                          void const *lwe_array_in,
-                                          uint32_t input_lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count);
-void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                          void *lwe_array_out,
-                                          void const *lwe_array_in,
-                                          uint32_t input_lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                       void *lwe_array_out,
-                                       void const *lwe_array_in_1,
-                                       void const *lwe_array_in_2,
-                                       uint32_t input_lwe_dimension,
-                                       uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                       void *lwe_array_out,
-                                       void const *lwe_array_in_1,
-                                       void const *lwe_array_in_2,
-                                       uint32_t input_lwe_dimension,
-                                       uint32_t input_lwe_ciphertext_count);
+void cuda_negate_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+void cuda_negate_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);

 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_plaintext_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint64_t plaintext_in,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 }

 #endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -26,15 +26,6 @@ template <typename Torus> uint64_t get_shared_mem_size_tgemm() {
  return BLOCK_SIZE_GEMM * THREADS_GEMM * 2 * sizeof(Torus);
 }

-__host__ inline bool can_use_pks_fast_path(uint32_t lwe_dimension,
-                                           uint32_t num_lwe,
-                                           uint32_t polynomial_size,
-                                           uint32_t level_count,
-                                           uint32_t glwe_dimension) {
-  // TODO: activate it back, fix tests and extend to level_count > 1
-  return false;
-}
-
 // Initialize decomposition by performing rounding
 // and decomposing one level of an array of Torus LWEs. Only
 // decomposes the mask elements of the incoming LWEs.
@@ -57,6 +48,8 @@ __global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,
  // is lwe_dimension + 1, while for writing it is lwe_dimension
  auto read_val_idx = lwe_idx * (lwe_dimension + 1) + lwe_sample_idx;
  auto write_val_idx = lwe_idx * lwe_dimension + lwe_sample_idx;
+  auto write_state_idx =
+      num_lwe * lwe_dimension + lwe_idx * lwe_dimension + lwe_sample_idx;

  Torus a_i = lwe_in[read_val_idx];

@@ -64,6 +57,8 @@ __global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,

  Torus mod_b_mask = (1ll << base_log) - 1ll;
  lwe_out[write_val_idx] = decompose_one<Torus>(state, mod_b_mask, base_log);
+  synchronize_threads_in_block();
+  lwe_out[write_state_idx] = state;
 }

 // Continue decomposiion of an array of Torus elements in place. Supposes
@@ -84,12 +79,16 @@ decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
    return;

  auto val_idx = lwe_idx * lwe_dimension + lwe_sample_idx;
+  auto state_idx = num_lwe * lwe_dimension + val_idx;

-  Torus state = buffer_in[val_idx];
+  Torus state = buffer_in[state_idx];
+  synchronize_threads_in_block();

  Torus mod_b_mask = (1ll << base_log) - 1ll;

  buffer_in[val_idx] = decompose_one<Torus>(state, mod_b_mask, base_log);
+  synchronize_threads_in_block();
+  buffer_in[state_idx] = state;
 }

 // Multiply matrices A, B of size (M, K), (K, N) respectively
@@ -99,6 +98,10 @@ decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
 // BLOCK_SIZE_GEMM) splitting them in multiple tiles: (BLOCK_SIZE_GEMM,
 // THREADS_GEMM)-shaped tiles of values from A, and a (THREADS_GEMM,
 // BLOCK_SIZE_GEMM)-shaped tiles of values from B.
+//
+// This code is adapted by generalizing the 1d block-tiling
+// kernel from https://github.com/siboehm/SGEMM_CUDA
+// to any matrix dimension
 template <typename Torus, typename TorusVec>
 __global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
                      int stride_B, Torus *C) {
@@ -111,7 +114,6 @@ __global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
  const uint cRow = blockIdx.y;
  const uint cCol = blockIdx.x;

-  const uint totalResultsBlocktile = BM * BN;
  const int threadCol = threadIdx.x % BN;
  const int threadRow = threadIdx.x / BN;

@@ -152,7 +154,7 @@ __global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
    } else {
      Bs[innerRowB * BN + innerColB] = 0;
    }
-    __syncthreads();
+    synchronize_threads_in_block();

    // Advance blocktile for the next iteration of this loop
    A += BK;
@@ -168,7 +170,7 @@ __global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
            As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
      }
    }
-    __syncthreads();
+    synchronize_threads_in_block();
  }

  // Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
@@ -259,10 +261,6 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

  // Optimization of packing keyswitch when packing many LWEs

-  if (level_count > 1) {
-    PANIC("Fast path PKS only supports level_count==1");
-  }
-
  cudaSetDevice(gpu_index);
  check_cuda_error(cudaGetLastError());

@@ -273,10 +271,11 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
  // buffer and the keyswitched GLWEs in the second half of the buffer. Thus the
  // scratch buffer for the fast path must determine the half-size of the
  // scratch buffer as the max between the size of the GLWE and the size of the
-  // LWE-mask
-  int memory_unit = glwe_accumulator_size > lwe_dimension
+  // LWE-mask times two (to keep both decomposition state and decomposed
+  // intermediate value)
+  int memory_unit = glwe_accumulator_size > lwe_dimension * 2
                        ? glwe_accumulator_size
-                        : lwe_dimension;
+                        : lwe_dimension * 2;

  // ping pong the buffer between successive calls
  // split the buffer in two parts of this size
@@ -309,7 +308,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
                 CEIL_DIV(num_lwes, BLOCK_SIZE_GEMM));
  dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM);

-  auto stride_KSK_buffer = glwe_accumulator_size;
+  auto stride_KSK_buffer = glwe_accumulator_size * level_count;

  uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
  tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
@@ -317,21 +316,20 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
      stride_KSK_buffer, d_mem_1);
  check_cuda_error(cudaGetLastError());

-  /*
-    TODO: transpose key to generalize to level_count > 1
+  auto ksk_block_size = glwe_accumulator_size;

-    for (int li = 1; li < level_count; ++li) {
-      decompose_vectorize_step_inplace<Torus, TorusVec>
-          <<<grid_decomp, threads_decomp, 0, stream>>>(
-              d_mem_0, lwe_dimension, num_lwes, base_log, level_count);
-      check_cuda_error(cudaGetLastError());
+  for (int li = 1; li < level_count; ++li) {
+    decompose_vectorize_step_inplace<Torus, TorusVec>
+        <<<grid_decomp, threads_decomp, 0, stream>>>(
+            d_mem_0, lwe_dimension, num_lwes, base_log, level_count);
+    check_cuda_error(cudaGetLastError());

-      tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size,
-    stream>>>( num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
-          fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
-      check_cuda_error(cudaGetLastError());
-    }
-  */
+    tgemm<Torus, TorusVec>
+        <<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
+            num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
+            fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
+    check_cuda_error(cudaGetLastError());
+  }

  // should we include the mask in the rotation ??
  dim3 grid_rotate(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP),
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -73,24 +73,13 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {

-  if (can_use_pks_fast_path(input_lwe_dimension, num_lwes,
-                            output_polynomial_size, level_count,
-                            output_glwe_dimension)) {
-    host_fast_packing_keyswitch_lwe_list_to_glwe<uint64_t, ulonglong4>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(glwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
-        input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
-        base_log, level_count, num_lwes);
-  } else
-    host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(glwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
-        input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
-        base_log, level_count, num_lwes);
+  host_fast_packing_keyswitch_lwe_list_to_glwe<uint64_t, ulonglong4>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
+      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
+      base_log, level_count, num_lwes);
 }

 void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -164,9 +164,11 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

-  int memory_unit = glwe_accumulator_size > lwe_dimension
+  // Allocate at least twice the LWE-mask size, to keep both the decomposition
+  // state and the decomposed intermediate value
+  int memory_unit = glwe_accumulator_size > lwe_dimension * 2
                        ? glwe_accumulator_size
-                        : lwe_dimension;
+                        : lwe_dimension * 2;

  if (allocate_gpu_memory) {
    *fp_ks_buffer = (int8_t *)cuda_malloc_async(
@@ -221,44 +223,6 @@ __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
  }
 }

-// public functional packing keyswitch for a batch of LWE ciphertexts
-//
-// Selects the input each thread is working on using the y-block index.
-//
-// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
-// different thread blocks at the x-axis to work on that input.
-template <typename Torus>
-__global__ void packing_keyswitch_lwe_list_to_glwe(
-    Torus *glwe_array_out, Torus const *lwe_array_in, Torus const *fp_ksk,
-    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    Torus *d_mem) {
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
-  const int lwe_size = (lwe_dimension_in + 1);
-
-  const int input_id = blockIdx.y;
-  const int degree = input_id;
-
-  // Select an input
-  auto lwe_in = lwe_array_in + input_id * lwe_size;
-  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
-  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
-
-  // KS LWE to GLWE
-  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext<Torus>(
-      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
-      polynomial_size, base_log, level_count);
-
-  // P * x ^degree
-  auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
-  auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
-  polynomial_accumulate_monic_monomial_mul<Torus>(out_result, in_poly, degree,
-                                                  tid % polynomial_size,
-                                                  polynomial_size, 1, true);
-}
-
 /// To-do: Rewrite this kernel for efficiency
 template <typename Torus>
 __global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
@@ -276,52 +240,4 @@ __global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
  }
 }

-template <typename Torus>
-__host__ void host_packing_keyswitch_lwe_list_to_glwe(
-    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
-    Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
-    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_lwes) {
-
-  if (num_lwes > polynomial_size)
-    PANIC("Cuda error: too many LWEs to pack. The number of LWEs should be "
-          "smaller than "
-          "polynomial_size.")
-
-  cudaSetDevice(gpu_index);
-  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
-
-  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
-
-  dim3 grid(num_blocks, num_lwes);
-  dim3 threads(num_threads);
-
-  // The fast path of PKS uses the scratch buffer (d_mem) differently:
-  // it needs to store the decomposed masks in the first half of this buffer
-  // and the keyswitched GLWEs in the second half of the buffer. Thus the
-  // scratch buffer for the fast path must determine the half-size of the
-  // scratch buffer as the max between the size of the GLWE and the size of the
-  // LWE-mask
-  int memory_unit = glwe_accumulator_size > lwe_dimension_in
-                        ? glwe_accumulator_size
-                        : lwe_dimension_in;
-
-  auto d_mem = (Torus *)fp_ks_buffer;
-  auto d_tmp_glwe_array_out = d_mem + num_lwes * memory_unit;
-
-  // individually keyswitch each lwe
-  packing_keyswitch_lwe_list_to_glwe<Torus><<<grid, threads, 0, stream>>>(
-      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
-      glwe_dimension, polynomial_size, base_log, level_count, d_mem);
-  check_cuda_error(cudaGetLastError());
-
-  // accumulate to a single glwe
-  accumulate_glwes<Torus><<<num_blocks, threads, 0, stream>>>(
-      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
-      num_lwes);
-  check_cuda_error(cudaGetLastError());
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -117,21 +117,11 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
  while (rem_lwes > 0) {
    auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);

-    if (can_use_pks_fast_path(
-            input_lwe_dimension, chunk_size, compression_params.polynomial_size,
-            compression_params.ks_level, compression_params.glwe_dimension)) {
-      host_fast_packing_keyswitch_lwe_list_to_glwe<Torus, ulonglong4>(
-          streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
-          fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
-          compression_params.polynomial_size, compression_params.ks_base_log,
-          compression_params.ks_level, chunk_size);
-    } else {
-      host_packing_keyswitch_lwe_list_to_glwe<Torus>(
-          streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
-          fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
-          compression_params.polynomial_size, compression_params.ks_base_log,
-          compression_params.ks_level, chunk_size);
-    }
+    host_fast_packing_keyswitch_lwe_list_to_glwe<Torus, ulonglong4>(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
+        compression_params.polynomial_size, compression_params.ks_base_log,
+        compression_params.ks_level, chunk_size);

    rem_lwes -= chunk_size;
    lwe_subset += chunk_size * lwe_in_size;
@@ -311,7 +301,7 @@ __host__ void host_integer_decompress(
  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
  /// dimension to a big LWE dimension
  auto encryption_params = h_mem_ptr->encryption_params;
-  auto lut = h_mem_ptr->carry_extract_lut;
+  auto lut = h_mem_ptr->decompression_rescale_lut;
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
  if (active_gpu_count == 1) {
    execute_pbs_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -627,26 +627,48 @@ void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
  std::rotate(buffer, buffer + mid, buffer + array_length);
 }

+/// Caller needs to ensure that the operation applied is coherent from an
+/// encoding perspective.
+///
+/// For example:
+///
+/// If the input encoding has 2 bits and the output encoding has 4 bits,
+/// applying the identity LUT maps the values as follows:
+///
+/// 0|00|xx -> 0|00|00
+/// 0|01|xx -> 0|00|01
+/// 0|10|xx -> 0|00|10
+/// 0|11|xx -> 0|00|11
+///
+/// The reason is that the identity function is computed in the input space
+/// but the scaling is done in the output space: as there are more bits in the
+/// output space, the delta is smaller, hence the apparent "division".
 template <typename Torus>
-void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
-                           uint32_t polynomial_size, uint32_t message_modulus,
-                           uint32_t carry_modulus,
-                           std::function<Torus(Torus)> f) {
+void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
+                                         uint32_t polynomial_size,
+                                         uint32_t input_message_modulus,
+                                         uint32_t input_carry_modulus,
+                                         uint32_t output_message_modulus,
+                                         uint32_t output_carry_modulus,
+                                         std::function<Torus(Torus)> f) {

-  uint32_t modulus_sup = message_modulus * carry_modulus;
-  uint32_t box_size = polynomial_size / modulus_sup;
-  Torus delta = (1ul << 63) / modulus_sup;
+  uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
+  uint32_t output_modulus_sup = output_message_modulus * output_carry_modulus;
+  uint32_t box_size = polynomial_size / input_modulus_sup;
+  auto nbits = sizeof(Torus) * 8;
+  Torus output_delta =
+      (static_cast<Torus>(1) << (nbits - 1)) / output_modulus_sup;

  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];

  // This accumulator extracts the carry bits
-  for (int i = 0; i < modulus_sup; i++) {
+  for (int i = 0; i < input_modulus_sup; i++) {
    int index = i * box_size;
    for (int j = index; j < index + box_size; j++) {
      auto f_eval = f(i);
-      body[j] = f_eval * delta;
+      body[j] = f_eval * output_delta;
    }
  }

@@ -660,6 +682,16 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
  rotate_left<Torus>(body, half_box_size, polynomial_size);
 }

+template <typename Torus>
+void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
+                           uint32_t polynomial_size, uint32_t message_modulus,
+                           uint32_t carry_modulus,
+                           std::function<Torus(Torus)> f) {
+  generate_lookup_table_with_encoding(acc, glwe_dimension, polynomial_size,
+                                      message_modulus, carry_modulus,
+                                      message_modulus, carry_modulus, f);
+}
+
 template <typename Torus>
 void generate_many_lookup_table(
    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -668,7 +700,8 @@ void generate_many_lookup_table(

  uint32_t modulus_sup = message_modulus * carry_modulus;
  uint32_t box_size = polynomial_size / modulus_sup;
-  Torus delta = (1ul << 63) / modulus_sup;
+  auto nbits = sizeof(Torus) * 8;
+  Torus delta = (static_cast<Torus>(1) << (nbits - 1)) / modulus_sup;

  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

@@ -803,6 +836,32 @@ void generate_device_accumulator_bivariate_with_factor(
  free(h_lut);
 }

+template <typename Torus>
+void generate_device_accumulator_with_encoding(
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_message_modulus, uint32_t input_carry_modulus,
+    uint32_t output_message_modulus, uint32_t output_carry_modulus,
+    std::function<Torus(Torus)> f) {
+
+  // host lut
+  Torus *h_lut =
+      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+
+  // fill accumulator
+  generate_lookup_table_with_encoding<Torus>(
+      h_lut, glwe_dimension, polynomial_size, input_message_modulus,
+      input_carry_modulus, output_message_modulus, output_carry_modulus, f);
+
+  // copy host lut and lut_indexes_vec to device
+  cuda_memcpy_async_to_gpu(
+      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
+      stream, gpu_index);
+
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
+}
+
 /*
 *  generate accumulator for device pointer
 *    v_stream - cuda stream
@@ -818,21 +877,9 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

-  // host lut
-  Torus *h_lut =
-      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-  // fill accumulator
-  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
-                               message_modulus, carry_modulus, f);
-
-  // copy host lut and lut_indexes_vec to device
-  cuda_memcpy_async_to_gpu(
-      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      stream, gpu_index);
-
-  cuda_synchronize_stream(stream, gpu_index);
-  free(h_lut);
+  generate_device_accumulator_with_encoding(
+      stream, gpu_index, acc, glwe_dimension, polynomial_size, message_modulus,
+      carry_modulus, message_modulus, carry_modulus, f);
 }

 /*
@@ -1055,7 +1102,8 @@ void host_compute_propagation_simulators_and_group_carries(
      message_modulus, carry_modulus);

  uint32_t modulus_sup = message_modulus * carry_modulus;
-  Torus delta = (1ull << 63) / modulus_sup;
+  auto nbits = sizeof(Torus) * 8;
+  Torus delta = (static_cast<Torus>(1) << (nbits - 1)) / modulus_sup;
  auto simulators = mem->simulators;
  auto grouping_pgns = mem->grouping_pgns;
  host_radix_split_simulators_and_grouping_pgns<Torus>(
@@ -1382,8 +1430,8 @@ __host__ void
 create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
                     Torus *lwe_array_out, Torus const *scalar_array,
                     uint32_t lwe_dimension, uint32_t num_radix_blocks,
-                     uint32_t num_scalar_blocks, uint64_t message_modulus,
-                     uint64_t carry_modulus) {
+                     uint32_t num_scalar_blocks, Torus message_modulus,
+                     Torus carry_modulus) {

  cudaSetDevice(gpu_index);
  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
@@ -1403,7 +1451,9 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
+  auto nbits = sizeof(Torus) * 8;
+  Torus delta = (static_cast<Torus>(1) << (nbits - 1)) /
+                (message_modulus * carry_modulus);

  device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -4,12 +4,11 @@
 * Perform the addition of two u32 input LWE ciphertext vectors.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
-void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                       void *lwe_array_out,
-                                       void const *lwe_array_in_1,
-                                       void const *lwe_array_in_2,
-                                       uint32_t input_lwe_dimension,
-                                       uint32_t input_lwe_ciphertext_count) {
+void cuda_add_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
                          static_cast<uint32_t *>(lwe_array_out),
@@ -44,12 +43,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 * vectors are left unchanged. This function is a wrapper to a device function
 * that performs the operation on the GPU.
 */
-void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                       void *lwe_array_out,
-                                       void const *lwe_array_in_1,
-                                       void const *lwe_array_in_2,
-                                       uint32_t input_lwe_dimension,
-                                       uint32_t input_lwe_ciphertext_count) {
+void cuda_add_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
                          static_cast<uint64_t *>(lwe_array_out),
@@ -65,7 +63,8 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_addition_plaintext<uint32_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
@@ -105,7 +104,8 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_addition_plaintext<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
@@ -114,3 +114,41 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
      static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
      input_lwe_ciphertext_count);
 }
+
+/*
+ * Perform the addition of a u64 input LWE ciphertext vector with a u64 input
+ * plaintext scalar.
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `lwe_array_out` is an array of size
+ * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
+ * been allocated on the GPU before calling this function, and that will hold
+ * the result of the computation.
+ * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
+ * been allocated and initialized before calling this function. It has the same
+ * size as the output array.
+ * - `plaintext_in` is the plaintext used as input.
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output LWE ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output.
+ *
+ * The same input plaintext is added to the body of each LWE ciphertext in
+ * the input LWE ciphertext vector. The result of the operation is stored in
+ * the output LWE ciphertext vector. The input vector is left unchanged.
+ * This function is a wrapper to a host function that launches the GPU
+ * kernel performing the operation.
+ */
+void cuda_add_lwe_ciphertext_vector_plaintext_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint64_t plaintext_in,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {
+
+  host_addition_plaintext_scalar<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
+      input_lwe_dimension, input_lwe_ciphertext_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -13,9 +13,10 @@
 #include <stdio.h>

 template <typename T>
-__global__ void
-plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
-                   uint32_t input_lwe_dimension, uint32_t num_entries) {
+__global__ void plaintext_addition(T *output, T const *lwe_input,
+                                   T const *plaintext_input,
+                                   const uint32_t input_lwe_dimension,
+                                   const uint32_t num_entries) {

  int tid = threadIdx.x;
  int plaintext_index = blockIdx.x * blockDim.x + tid;
@@ -28,10 +29,26 @@ plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
 }

 template <typename T>
-__host__ void
-host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
-                        T const *lwe_input, T const *plaintext_input,
-                        uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
+__global__ void plaintext_addition_scalar(T *output, T const *lwe_input,
+                                          const T plaintext_input,
+                                          const uint32_t input_lwe_dimension,
+                                          const uint32_t num_entries) {
+  // One thread per ciphertext; only the body (last element) is modified.
+  const uint32_t lwe_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (lwe_index < num_entries) {
+    // Computed in size_t so the offset cannot wrap in 32 bits
+    const size_t lwe_size = static_cast<size_t>(input_lwe_dimension) + 1;
+    const size_t index = lwe_index * lwe_size + input_lwe_dimension;
+    output[index] = lwe_input[index] + plaintext_input; // unsigned wraparound
+  }
+}
+
+template <typename T>
+__host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
+                                      T *output, T const *lwe_input,
+                                      T const *plaintext_input,
+                                      const uint32_t lwe_dimension,
+                                      const uint32_t lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
@@ -48,6 +65,27 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
  check_cuda_error(cudaGetLastError());
 }

+template <typename T>
+__host__ void host_addition_plaintext_scalar(
+    cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
+    const T plaintext_input, const uint32_t lwe_dimension,
+    const uint32_t lwe_ciphertext_count) {
+  // Copies the input into `output`, then adds the scalar to each LWE body.
+  check_cuda_error(cudaSetDevice(gpu_index));
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = lwe_ciphertext_count;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  // Byte count in size_t: a 32-bit product would wrap for large batches.
+  size_t bytes = (size_t(lwe_dimension) + 1) * lwe_ciphertext_count * sizeof(T);
+  cuda_memcpy_async_gpu_to_gpu(output, lwe_input, bytes, stream, gpu_index);
+  plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
+      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename T>
 __global__ void addition(T *output, T const *input_1, T const *input_2,
                         uint32_t num_entries) {
@@ -64,8 +102,8 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
 template <typename T>
 __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
                            T const *input_1, T const *input_2,
-                            uint32_t input_lwe_dimension,
-                            uint32_t input_lwe_ciphertext_count) {
+                            const uint32_t input_lwe_dimension,
+                            const uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -7,7 +7,8 @@
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_cleartext_vec_multiplication<uint32_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
@@ -47,7 +48,8 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *cleartext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_cleartext_vec_multiplication<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -16,8 +16,8 @@
 template <typename T>
 __global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
                                             T const *cleartext_input,
-                                             uint32_t input_lwe_dimension,
-                                             uint32_t num_entries) {
+                                             const uint32_t input_lwe_dimension,
+                                             const uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
@@ -31,8 +31,8 @@ __global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
 template <typename T>
 __host__ void host_cleartext_vec_multiplication(
    cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
-    T const *cleartext_input, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count) {
+    T const *cleartext_input, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
@@ -4,11 +4,10 @@
 * Perform the negation of a u32 input LWE ciphertext vector.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
-void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                          void *lwe_array_out,
-                                          void const *lwe_array_in,
-                                          uint32_t input_lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count) {
+void cuda_negate_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
                          static_cast<uint32_t *>(lwe_array_out),
@@ -38,11 +37,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 * LWE ciphertext vector is left unchanged. This function is a wrapper to a
 * device function that performs the operation on the GPU.
 */
-void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                          void *lwe_array_out,
-                                          void const *lwe_array_in,
-                                          uint32_t input_lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count) {
+void cuda_negate_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

  host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
                          static_cast<uint64_t *>(lwe_array_out),
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -23,8 +23,8 @@ __global__ void negation(T *output, T const *input, uint32_t num_entries) {

 template <typename T>
 __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
-                            T const *input, uint32_t input_lwe_dimension,
-                            uint32_t input_lwe_ciphertext_count) {
+                            T const *input, const uint32_t input_lwe_dimension,
+                            const uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -480,30 +480,20 @@ __host__ void host_programmable_bootstrap(
  double2 *global_join_buffer = pbs_buffer->global_join_buffer;
  int8_t *d_mem = pbs_buffer->d_mem;

-  bool graphCreated = false;
-  cudaGraph_t graph;
-  cudaGraphExec_t instance;
  for (int i = 0; i < lwe_dimension; i++) {
-    if (!graphCreated) {
-      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
-      execute_step_one<Torus, params>(
-          stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-          lwe_input_indexes, bootstrapping_key, global_accumulator,
-          global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
-          glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
-          partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
-      execute_step_two<Torus, params>(
-          stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-          lut_vector_indexes, bootstrapping_key, global_accumulator,
-          global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
-          glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
-          partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
-          num_many_lut, lut_stride);
-      cudaStreamEndCapture(stream, &graph);
-      cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
-      graphCreated = true;
-    }
-    cudaGraphLaunch(instance, stream);
+    execute_step_one<Torus, params>(
+        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+        lwe_input_indexes, bootstrapping_key, global_accumulator,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
+        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
+    execute_step_two<Torus, params>(
+        stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
+        lut_vector_indexes, bootstrapping_key, global_accumulator,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
+        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
+        num_many_lut, lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -649,41 +649,29 @@ __host__ void host_multi_bit_programmable_bootstrap(

  auto lwe_chunk_size = buffer->lwe_chunk_size;

-  bool graphCreated = false;
-  cudaGraph_t graph;
-  cudaGraphExec_t instance;
-
  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {

-    if (!graphCreated) {
-      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
-      // Compute a keybundle
-      execute_compute_keybundle<Torus, params>(
-          stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-          buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, lwe_offset);
-      // Accumulate
-      uint32_t chunk_size = std::min(
-          lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-      for (uint32_t j = 0; j < chunk_size; j++) {
-        execute_step_one<Torus, params>(
-            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-            lwe_input_indexes, buffer, num_samples, lwe_dimension,
-            glwe_dimension, polynomial_size, base_log, level_count, j,
-            lwe_offset);
+    // Compute a keybundle
+    execute_compute_keybundle<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);
+    // Accumulate
+    uint32_t chunk_size = std::min(
+        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+    for (uint32_t j = 0; j < chunk_size; j++) {
+      execute_step_one<Torus, params>(
+          stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
+          lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, j, lwe_offset);

-        execute_step_two<Torus, params>(
-            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
-            num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-            grouping_factor, level_count, j, lwe_offset, num_many_lut,
-            lut_stride);
-      }
-      cudaStreamEndCapture(stream, &graph);
-      cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
-      graphCreated = true;
+      execute_step_two<Torus, params>(
+          stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
+          num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+          grouping_factor, level_count, j, lwe_offset, num_many_lut,
+          lut_stride);
    }
-    cudaGraphLaunch(instance, stream);
  }
 }
 #endif // MULTIBIT_PBS_H
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -237,7 +237,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
        (ClassicalProgrammableBootstrapTestParams){
            887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4,
            100, 1, 1},
-        // PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
+        // V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
        (ClassicalProgrammableBootstrapTestParams){
            977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07),
            new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100,
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1345,6 +1345,17 @@ extern "C" {
        input_lwe_ciphertext_count: u32,
    );
 }
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_plaintext_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        plaintext_in: u64,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
 extern "C" {
    pub fn cuda_fourier_polynomial_mul(
        stream: *mut ffi::c_void,
--- a/docker/Dockerfile.wasm_tests
+++ b/docker/Dockerfile.wasm_tests
@@ -1,40 +0,0 @@
-FROM ubuntu:22.04
-
-ENV TZ=Europe/Paris
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-
-# Replace default archive.ubuntu.com with fr mirror
-# original archive showed performance issues and is farther away
-RUN sed -i 's|^deb http://archive.ubuntu.com/ubuntu/|deb http://mirror.ubuntu.ikoula.com/|g' /etc/apt/sources.list && \
-    sed -i 's|^deb http://security.ubuntu.com/ubuntu/|deb http://mirror.ubuntu.ikoula.com/|g' /etc/apt/sources.list
-
-ENV CARGO_TARGET_DIR=/root/tfhe-rs-target
-
-ARG RUST_TOOLCHAIN="stable"
-ARG NODE_VERSION
-
-WORKDIR /tfhe-wasm-tests
-
-RUN apt-get update && \
-    apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    python3 \
-    python3-pip \
-    python3-venv && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > install-rustup.sh && \
-    chmod +x install-rustup.sh && \
-    ./install-rustup.sh -y --default-toolchain "${RUST_TOOLCHAIN}" \
-    -c rust-src -t wasm32-unknown-unknown && \
-    . "$HOME/.cargo/env" && \
-    cargo install wasm-pack && \
-    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh > install-node.sh && \
-    chmod +x install-node.sh && \
-    ./install-node.sh && \
-    . "$HOME/.nvm/nvm.sh" && \
-    bash -i -c 'nvm install ${NODE_VERSION} && nvm use ${NODE_VERSION}'
-
-WORKDIR /tfhe-wasm-tests/tfhe-rs/
--- a/tasks/src/check_tfhe_docs_are_tested.rs
+++ b/tasks/src/check_tfhe_docs_are_tested.rs
@@ -7,10 +7,10 @@ const DIR_TO_IGNORE: [&str; 3] = [
    ".git",
    "target",
    // If the data repo has been cloned, we ignore its README
-    "tfhe/tfhe-backward-compat-data",
+    "tests/tfhe-backward-compat-data",
 ];

-const FILES_TO_IGNORE: [&str; 5] = [
+const FILES_TO_IGNORE: [&str; 6] = [
    // This contains fragments of code that are unrelated to TFHE-rs
    "tfhe/docs/tutorials/sha256_bool.md",
    // TODO: This contains code that could be executed as a trivium docstring
@@ -21,6 +21,7 @@ const FILES_TO_IGNORE: [&str; 5] = [
    "tfhe-fft/README.md",
    // TODO: find a way to test the tfhe-ntt readme
    "tfhe-ntt/README.md",
+    "utils/tfhe-lints/README.md",
 ];

 pub fn check_tfhe_docs_are_tested() -> Result<(), Error> {
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "tests"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dev-dependencies]
+tfhe = { path = "../tfhe" }
+tfhe-versionable = { path = "../utils/tfhe-versionable" }
+tfhe-backward-compat-data = { git = "https://github.com/zama-ai/tfhe-backward-compat-data.git", branch = "v0.5", default-features = false, features = [
+    "load",
+] }
+ron = "0.8"
+
+
+[[test]]
+name = "backward_compatibility_tests"
+path = "backward_compatibility_tests.rs"
+
+[features]
+shortint = ["tfhe/shortint"]
+integer = ["shortint", "tfhe/integer"]
+zk-pok = ["tfhe/zk-pok"]
--- a/tfhe/tests/backward_compatibility/high_level_api.rs
+++ b/tfhe/tests/backward_compatibility/high_level_api.rs
--- a/tfhe/tests/backward_compatibility/mod.rs
+++ b/tfhe/tests/backward_compatibility/mod.rs
--- a/tfhe/tests/backward_compatibility/shortint.rs
+++ b/tfhe/tests/backward_compatibility/shortint.rs
--- a/tfhe/tests/backward_compatibility_tests.rs
+++ b/tfhe/tests/backward_compatibility_tests.rs
--- a/tfhe-fft/Cargo.toml
+++ b/tfhe-fft/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-fft"
-version = "0.6.0"
+version = "0.7.0"
 edition = "2021"
 description = "tfhe-fft is a pure Rust high performance fast Fourier transform library."
 readme = "README.md"
@@ -43,7 +43,7 @@ getrandom = { version = "0.2", features = ["js"] }
 rug = "1.19.1"

 [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
-criterion = "0.4"
+criterion = "0.5"
 fftw-sys = { version = "0.6", default-features = false, features = ["system"] }

 [[bench]]
--- a/tfhe-ntt/Cargo.toml
+++ b/tfhe-ntt/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-ntt"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 description = "tfhe-ntt is a pure Rust high performance number theoretic transform library."
 readme = "README.md"
@@ -22,7 +22,7 @@ std = ["pulp/std", "aligned-vec/std"]
 nightly = ["pulp/nightly"]

 [dev-dependencies]
-criterion = "0.4"
+criterion = "0.5"
 rand = { workspace = true }
 serde = "1.0.163"
 serde_json = "1.0.96"
--- a/tfhe-ntt/src/product.rs
+++ b/tfhe-ntt/src/product.rs
@@ -278,12 +278,12 @@ impl Plan {
        let ntt_32: &mut [u32] = bytemuck::cast_slice_mut(ntt_32);

        // optimize common cases(?): u64x1, u32x1
-        if self.plan_32.len() == 0 && self.plan_64.len() == 1 {
+        if self.plan_32.is_empty() && self.plan_64.len() == 1 {
            ntt_64.copy_from_slice(standard);
            self.plan_64[0].fwd(ntt_64);
            return;
        }
-        if self.plan_32.len() == 1 && self.plan_64.len() == 0 {
+        if self.plan_32.len() == 1 && self.plan_64.is_empty() {
            for (ntt, &standard) in ntt_32.iter_mut().zip(standard) {
                *ntt = standard as u32;
            }
@@ -291,7 +291,7 @@ impl Plan {
            return;
        }

-        if self.plan_32.len() == 2 && self.plan_64.len() == 0 {
+        if self.plan_32.len() == 2 && self.plan_64.is_empty() {
            let (ntt0, ntt1) = ntt_32.split_at_mut(self.ntt_size());
            let p0_div = self.plan_32[0].p_div();
            let p1_div = self.plan_32[1].p_div();
@@ -375,7 +375,7 @@ impl Plan {
        let ntt_64 = &*ntt_64;

        // optimize common cases(?): u64x1, u32x1, u32x2
-        if self.plan_32.len() == 0 && self.plan_64.len() == 0 {
+        if self.plan_32.is_empty() && self.plan_64.is_empty() {
            match mode {
                InvMode::Replace => standard.fill(0),
                InvMode::Accumulate => {}
@@ -383,7 +383,7 @@ impl Plan {
            return;
        }

-        if self.plan_32.len() == 0 && self.plan_64.len() == 1 {
+        if self.plan_32.is_empty() && self.plan_64.len() == 1 {
            match mode {
                InvMode::Replace => standard.copy_from_slice(ntt_64),
                InvMode::Accumulate => {
@@ -396,7 +396,7 @@ impl Plan {
            }
            return;
        }
-        if self.plan_32.len() == 1 && self.plan_64.len() == 0 {
+        if self.plan_32.len() == 1 && self.plan_64.is_empty() {
            match mode {
                InvMode::Replace => {
                    for (standard, &ntt) in standard.iter_mut().zip(ntt_32) {
@@ -416,7 +416,7 @@ impl Plan {

        // implements the algorithms from "the art of computer programming (Donald E. Knuth)" 4.3.2
        // for finding solutions of the chinese remainder theorem
-        if self.plan_32.len() == 2 && self.plan_64.len() == 0 {
+        if self.plan_32.len() == 2 && self.plan_64.is_empty() {
            let (ntt0, ntt1) = ntt_32.split_at(self.ntt_size());
            let p0 = self.plan_32[0].modulus();
            let p1 = self.plan_32[1].modulus();
--- a/tfhe-zk-pok/Cargo.toml
+++ b/tfhe-zk-pok/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-zk-pok"
-version = "0.3.1"
+version = "0.4.0"
 edition = "2021"
 keywords = ["zero", "knowledge", "proof", "vector-commitments"]
 homepage = "https://zama.ai/"
@@ -8,6 +8,7 @@ documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 license = "BSD-3-Clause-Clear"
 description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryption for TFHE."
+rust-version = "1.84"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

@@ -22,7 +23,7 @@ sha3 = "0.10.8"
 serde = { workspace = true, features = ["default", "derive"] }
 zeroize = "1.7.0"
 num-bigint = "0.4.5"
-tfhe-versionable = { version = "0.3.2", path = "../utils/tfhe-versionable" }
+tfhe-versionable = { version = "0.4.0", path = "../utils/tfhe-versionable" }

 [dev-dependencies]
 serde_json = "~1.0"
--- a/tfhe-zk-pok/src/backward_compatibility/mod.rs
+++ b/tfhe-zk-pok/src/backward_compatibility/mod.rs
@@ -11,7 +11,7 @@ use std::fmt::Display;
 use tfhe_versionable::{Upgrade, Version, VersionsDispatch};

 use crate::curve_api::Curve;
-use crate::four_squares::{isqrt, sqr};
+use crate::four_squares::sqr;
 use crate::proofs::pke_v2::Bound;
 use crate::proofs::GroupElements;
 use crate::serialization::{
@@ -102,7 +102,7 @@ impl Upgrade<SerializablePKEv2PublicParams> for SerializablePKEv2PublicParamsV0
    type Error = Infallible;

    fn upgrade(self) -> Result<SerializablePKEv2PublicParams, Self::Error> {
-        let slack_factor = isqrt((self.d + self.k) as u128) as u64;
+        let slack_factor = (self.d + self.k).isqrt() as u64;
        let B_inf = self.B / slack_factor;
        Ok(SerializablePKEv2PublicParams {
            g_lists: self.g_lists,
@@ -110,7 +110,7 @@ impl Upgrade<SerializablePKEv2PublicParams> for SerializablePKEv2PublicParamsV0
            n: self.n,
            d: self.d,
            k: self.k,
-            B_bound_squared: sqr(self.B_bound as u128),
+            B_bound_squared: sqr(self.B_bound),
            B_inf,
            q: self.q,
            t: self.t,
--- a/tfhe-zk-pok/src/four_squares.rs
+++ b/tfhe-zk-pok/src/four_squares.rs
@@ -1,7 +1,9 @@
 use ark_ff::biginteger::arithmetic::widening_mul;
 use rand::prelude::*;

-pub fn sqr(x: u128) -> u128 {
+/// Squares a `u64` in `u128`, so the product can never overflow
+pub fn sqr(x: u64) -> u128 {
+    let x = u128::from(x);
     x * x
 }

@@ -9,37 +11,8 @@ pub fn checked_sqr(x: u128) -> Option<u128> {
    x.checked_mul(x)
 }

-// copied from the standard library
-// since isqrt is unstable at the moment
-pub fn isqrt(this: u128) -> u128 {
-    if this < 2 {
-        return this;
-    }
-
-    // The algorithm is based on the one presented in
-    // <https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_(base_2)>
-    // which cites as source the following C code:
-    // <https://web.archive.org/web/20120306040058/http://medialab.freaknet.org/martin/src/sqrt/sqrt.c>.
-
-    let mut op = this;
-    let mut res = 0;
-    let mut one = 1 << (this.ilog2() & !1);
-
-    while one != 0 {
-        if op >= res + one {
-            op -= res + one;
-            res = (res >> 1) + one;
-        } else {
-            res >>= 1;
-        }
-        one >>= 2;
-    }
-
-    res
-}
-
 fn half_gcd(p: u128, s: u128) -> u128 {
-    let sq_p = isqrt(p as _);
+    let sq_p = p.isqrt();
    let mut a = p;
    let mut b = s;
    while b > sq_p {
@@ -225,13 +198,13 @@ pub fn four_squares(v: u128) -> [u64; 4] {

    let f = v % 4;
    if f == 2 {
-        let b = isqrt(v as _) as u64;
+        let b = v.isqrt() as u64;

        'main_loop: loop {
            let x = 2 + rng.gen::<u64>() % (b - 2);
            let y = 2 + rng.gen::<u64>() % (b - 2);

-            let (sum, o) = u128::overflowing_add(sqr(x as u128), sqr(y as u128));
+            let (sum, o) = u128::overflowing_add(sqr(x), sqr(y));
            if o || sum > v {
                continue 'main_loop;
            }
@@ -288,9 +261,9 @@ pub fn four_squares(v: u128) -> [u64; 4] {
            let i = mont.natural_from_mont(sqrt);
            let i = if i <= p / 2 { p - i } else { i };
            let z = half_gcd(p, i) as u64;
-            let w = isqrt(p - sqr(z as u128)) as u64;
+            let w = (p - sqr(z)).isqrt() as u64;

-            if p != sqr(z as u128) + sqr(w as u128) {
+            if p != sqr(z) + sqr(w) {
                continue 'main_loop;
            }

--- a/tfhe-zk-pok/src/proofs/pke_v2.rs
+++ b/tfhe-zk-pok/src/proofs/pke_v2.rs
@@ -511,7 +511,7 @@ than the lwe dimension d. Please pick a smaller k: k = {k}, d = {d}"
            Bound::GHL => 950625,
            Bound::CS => 2 * (d as u128 + k as u128) + 4,
        })
-        .checked_mul(B_squared + (sqr((d + 2) as u128) * (d + k) as u128) / 4)
+        .checked_mul(B_squared + (sqr((d + 2) as u64) * (d + k) as u128) / 4)
        .unwrap_or_else(|| {
            panic!(
                "Invalid parameters for zk_pok, B_squared: {B_squared}, d: {d}, k: {k}. \
@@ -552,8 +552,9 @@ The computed m parameter is {m_bound} > 64. Please select a smaller B, d and/or
 /// Use the relationship: `||x||_2 <= sqrt(dim)*||x||_inf`. Since we are only interested in the
 /// squared bound, we avoid the sqrt by returning dim*(||x||_inf)^2.
 fn inf_norm_bound_to_euclidean_squared(B_inf: u64, dim: usize) -> u128 {
-    checked_sqr(B_inf as u128)
-        .and_then(|norm_squared| norm_squared.checked_mul(dim as u128))
+    let norm_squared = sqr(B_inf);
+    norm_squared
+        .checked_mul(dim as u128)
        .unwrap_or_else(|| panic!("Invalid parameters for zk_pok, B_inf: {B_inf}, d+k: {dim}"))
 }

@@ -765,7 +766,7 @@ fn prove_impl<G: Curve>(
    let e_sqr_norm = e1
        .iter()
        .chain(e2)
-        .map(|x| sqr(x.unsigned_abs() as u128))
+        .map(|x| sqr(x.unsigned_abs()))
        .sum::<u128>();

    if sanity_check_mode == ProofSanityCheckMode::Panic {
@@ -940,7 +941,7 @@ fn prove_impl<G: Curve>(
                assert!(
                    checked_sqr(acc.unsigned_abs()).unwrap() <= B_bound_squared,
                    "sqr(acc) ({}) > B_bound_squared ({B_bound_squared})",
-                    sqr(acc as u128)
+                    checked_sqr(acc.unsigned_abs()).unwrap()
                );
            }
            acc as i64
@@ -2786,7 +2787,7 @@ mod tests {
            };

            let B_with_slack_squared = inf_norm_bound_to_euclidean_squared(B, d + k);
-            let B_with_slack = isqrt(B_with_slack_squared) as u64;
+            let B_with_slack = B_with_slack_squared.isqrt() as u64;

            let bound = match slack_mode {
                // The slack is maximal, any term above B+slack should be refused
@@ -2797,7 +2798,7 @@ mod tests {
                        .e1
                        .iter()
                        .chain(&testcase.e2)
-                        .map(|x| sqr(x.unsigned_abs() as u128))
+                        .map(|x| sqr(x.unsigned_abs()))
                        .sum::<u128>();

                    let orig_value = match coeff_type {
@@ -2806,8 +2807,8 @@ mod tests {
                    };

                    let bound_squared =
-                        B_with_slack_squared - (e_sqr_norm - sqr(orig_value as u128));
-                    isqrt(bound_squared) as i64
+                        B_with_slack_squared - (e_sqr_norm - sqr(orig_value as u64));
+                    bound_squared.isqrt() as i64
                }
                // There is no slack effect, any term above B should be refused
                BoundTestSlackMode::Min => B as i64,
@@ -2849,7 +2850,7 @@ mod tests {
        let crs_max_k = crs_gen::<Curve>(d, d, B, q, t, msbs_zero_padding_bit_count, rng);

        let B_with_slack_squared = inf_norm_bound_to_euclidean_squared(B, d + k);
-        let B_with_slack_upper = isqrt(B_with_slack_squared) as u64 + 1;
+        let B_with_slack_upper = B_with_slack_squared.isqrt() as u64 + 1;

        // Generate test noise vectors with random coeffs and one completely out of bounds

--- a/tfhe/Cargo.toml
+++ b/tfhe/Cargo.toml
@@ -17,7 +17,7 @@ exclude = [
    "/js_on_wasm_tests/",
    "/web_wasm_parallel_tests/",
 ]
-rust-version = "1.83"
+rust-version = "1.84"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

@@ -32,38 +32,35 @@ serde_json = "1.0.94"
 clap = { version = "=4.4.4", features = ["derive"] }
 # Used in user documentation
 fs2 = { version = "0.4.3" }
-statrs = "0.16"
+statrs = "0.18"
 # For erf and normality test
 libm = "0.2.6"
 # Begin regex-engine deps
 test-case = "3.1.0"
 combine = "4.6.6"
-env_logger = "0.10.0"
+env_logger = "0.11"
 log = "0.4.19"
 hex = "0.4.3"
 # End regex-engine deps
-# Used for backward compatibility test metadata
-ron = "0.8"
-tfhe-backward-compat-data = { git = "https://github.com/zama-ai/tfhe-backward-compat-data.git", branch = "v0.4", default-features = false, features = [
-    "load",
-] }
 strum = { version = "0.26", features = ["derive"] }

 [build-dependencies]
-cbindgen = { version = "0.26.0", optional = true }
+cbindgen = { version = "0.28", optional = true }

 [dependencies]
-tfhe-csprng = { version = "0.5.0", path = "../tfhe-csprng", features = ["parallel"] }
+tfhe-csprng = { version = "0.5.0", path = "../tfhe-csprng", features = [
+    "parallel",
+] }
 serde = { workspace = true, features = ["default", "derive"] }
 rayon = { workspace = true }
 bincode = "1.3.3"
-tfhe-fft = { version = "0.6.0", path = "../tfhe-fft", features = [
+tfhe-fft = { version = "0.7.0", path = "../tfhe-fft", features = [
    "serde",
    "fft128",
 ] }
-tfhe-ntt = { version = "0.3.0", path = "../tfhe-ntt" }
+tfhe-ntt = { version = "0.4.0", path = "../tfhe-ntt" }
 pulp = { workspace = true, features = ["default"] }
-tfhe-cuda-backend = { version = "0.6.0", path = "../backends/tfhe-cuda-backend", optional = true }
+tfhe-cuda-backend = { version = "0.7.0", path = "../backends/tfhe-cuda-backend", optional = true }
 aligned-vec = { workspace = true, features = ["default", "serde"] }
 dyn-stack = { workspace = true, features = ["default"] }
 paste = "1.0.7"
@@ -72,14 +69,14 @@ fs2 = { version = "0.4.3", optional = true }
 sha3 = { version = "0.10", optional = true }
 itertools = { workspace = true }
 rand_core = { version = "0.6.4", features = ["std"] }
-tfhe-zk-pok = { version = "0.3.1", path = "../tfhe-zk-pok", optional = true }
-tfhe-versionable = { version = "0.3.2", path = "../utils/tfhe-versionable" }
+tfhe-zk-pok = { version = "0.4.0", path = "../tfhe-zk-pok", optional = true }
+tfhe-versionable = { version = "0.4.0", path = "../utils/tfhe-versionable" }

 # wasm deps
 wasm-bindgen = { workspace = true, features = [
    "serde-serialize",
 ], optional = true }
-wasm-bindgen-rayon = { version = "=1.2.2", optional = true }
+wasm-bindgen-rayon = { version = "1.3.0", optional = true }
 js-sys = { version = "0.3", optional = true }
 console_error_panic_hook = { version = "0.1.7", optional = true }
 serde-wasm-bindgen = { version = "0.6.0", optional = true }
@@ -131,16 +128,17 @@ __profiling = []

 software-prng = ["tfhe-csprng/software-prng"]

-# Cover several profiles as we cannot have a wildcard it seems
-[package.metadata.wasm-pack.profile.dev.wasm-bindgen]
-split-linked-modules = true
-
-[package.metadata.wasm-pack.profile.release.wasm-bindgen]
-split-linked-modules = true
-
 [package.metadata.docs.rs]
 # TODO: manage builds for docs.rs based on their documentation https://docs.rs/about
-features = ["boolean", "shortint", "integer", "gpu", "zk-pok", "software-prng"]
+features = [
+    "boolean",
+    "shortint",
+    "integer",
+    "gpu",
+    "zk-pok",
+    "software-prng",
+    "strings",
+]
 rustdoc-args = ["--html-in-header", "katex-header.html"]

 ###########
@@ -321,7 +319,7 @@ crate-type = ["lib", "staticlib", "cdylib"]
 [lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = [
    'cfg(tarpaulin)',
-    'cfg(tfhe_lints)',
+    'cfg(dylint_lib, values(any()))',
    # This is a bug/unwanted behavior from wasm_bindgen macro, for now warn instead of erroring
    'cfg(wasm_bindgen_unstable_test_coverage)',
 ] }
--- a/tfhe/benches/core_crypto/ks_bench.rs
+++ b/tfhe/benches/core_crypto/ks_bench.rs
@@ -9,8 +9,10 @@ use tfhe::core_crypto::prelude::*;
 use tfhe::keycache::NamedParam;
 use tfhe::shortint::parameters::{
    COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
-    PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
 };
 #[cfg(feature = "gpu")]
 use tfhe::shortint::parameters::{
@@ -21,13 +23,13 @@ use tfhe::shortint::parameters::{
 };
 #[cfg(not(feature = "gpu"))]
 use tfhe::shortint::parameters::{
-    PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::shortint::{MultiBitPBSParameters, PBSParameters};
@@ -35,28 +37,28 @@ use tfhe::shortint::{MultiBitPBSParameters, PBSParameters};
 #[cfg(not(feature = "gpu"))]
 const SHORTINT_BENCH_PARAMS: [ClassicPBSParameters; 5] = [
    PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
-    PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
 ];

 #[cfg(feature = "gpu")]
 const SHORTINT_BENCH_PARAMS: [ClassicPBSParameters; 4] = [
    PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
-    PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
 ];

 #[cfg(not(feature = "gpu"))]
 const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 6] = [
-    PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
 ];

 #[cfg(feature = "gpu")]
--- a/tfhe/benches/core_crypto/pbs_bench.rs
+++ b/tfhe/benches/core_crypto/pbs_bench.rs
@@ -17,10 +17,10 @@ const SHORTINT_BENCH_PARAMS_TUNIFORM: [ClassicPBSParameters; 1] =
    [PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64];

 const SHORTINT_BENCH_PARAMS_GAUSSIAN: [ClassicPBSParameters; 4] = [
-    PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-    PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
 ];

 const BOOLEAN_BENCH_PARAMS: [(&str, BooleanParameters); 2] = [
@@ -57,17 +57,17 @@ fn throughput_benchmark_parameters_64bits() -> Vec<(String, CryptoParametersReco
    let parameters = if cfg!(feature = "gpu") {
        vec![
            PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
-            PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
        ]
    } else {
        vec![
            PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
-            PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M64,
        ]
    };

@@ -102,12 +102,12 @@ fn multi_bit_benchmark_parameters_64bits(
        ]
    } else {
        vec![
-            PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-            PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+            V0_11_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
        ]
    };

@@ -832,13 +832,20 @@ mod cuda {
    use tfhe::core_crypto::prelude::*;
    use tfhe::keycache::NamedParam;
    use tfhe::shortint::parameters::{
-        PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_2_CARRY_0_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_2_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
-        PARAM_MESSAGE_3_CARRY_0_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_3_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_4_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_4_CARRY_3_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_5_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_6_CARRY_0_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_7_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
+        V0_11_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_2_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_2_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_3_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_3_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_4_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_4_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_5_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_6_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_7_CARRY_0_KS_PBS_GAUSSIAN_2M64,
    };
    use tfhe::shortint::{ClassicPBSParameters, PBSParameters};

@@ -846,19 +853,19 @@ mod cuda {
        // TUniform
        PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
        // Gaussian
-        PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_2_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_2_CARRY_1_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_3_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_3_CARRY_2_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_4_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_4_CARRY_3_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_5_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_6_CARRY_0_KS_PBS_GAUSSIAN_2M64,
-        PARAM_MESSAGE_7_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_2_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_2_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_3_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_3_CARRY_2_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_4_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_4_CARRY_3_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_5_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_6_CARRY_0_KS_PBS_GAUSSIAN_2M64,
+        V0_11_PARAM_MESSAGE_7_CARRY_0_KS_PBS_GAUSSIAN_2M64,
    ];

    fn cuda_benchmark_parameters_64bits() -> Vec<(String, CryptoParametersRecord<u64>)> {
--- a/tfhe/benches/integer/bench.rs
+++ b/tfhe/benches/integer/bench.rs
--- a/tfhe/benches/integer/glwe_packing_compression.rs
+++ b/tfhe/benches/integer/glwe_packing_compression.rs
@@ -6,6 +6,7 @@ use crate::utilities::{
 };
 use criterion::{black_box, criterion_group, Criterion, Throughput};
 use rayon::prelude::*;
+use std::cmp::max;
 use tfhe::integer::ciphertext::CompressedCiphertextListBuilder;
 use tfhe::integer::{ClientKey, RadixCiphertext};
 use tfhe::keycache::NamedParam;
@@ -77,9 +78,19 @@ fn cpu_glwe_packing(c: &mut Criterion) {
                });
            }
            BenchmarkType::Throughput => {
+                // Execute the operation once to know its cost.
+                let ct = cks.encrypt_radix(0_u32, num_blocks);
+                let mut builder = CompressedCiphertextListBuilder::new();
+                builder.push(ct);
+                let compressed = builder.build(&compression_key);
+
+                reset_pbs_count();
+                let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap();
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                let num_block =
                    (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize;
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                // FIXME thread usage seemed to be somewhat more "efficient".
                //  For example, with bit_size = 2, my laptop is only using around 2/3 of the
                // available threads  Thread usage increases with bit_size = 8 but
@@ -150,6 +161,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
 #[cfg(feature = "gpu")]
 mod cuda {
    use super::*;
+    use std::cmp::max;
    use tfhe::core_crypto::gpu::CudaStreams;
    use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
    use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
@@ -185,27 +197,26 @@ mod cuda {
            let bench_id_pack;
            let bench_id_unpack;

+            // Generate private compression key
+            let cks = ClientKey::new(param);
+            let private_compression_key = cks.new_compression_private_key(comp_param);
+
+            // Generate and convert compression keys
+            let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
+            let (compressed_compression_key, compressed_decompression_key) =
+                radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
+            let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
+            let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
+                radix_cks.parameters().glwe_dimension(),
+                radix_cks.parameters().polynomial_size(),
+                radix_cks.parameters().message_modulus(),
+                radix_cks.parameters().carry_modulus(),
+                radix_cks.parameters().ciphertext_modulus(),
+                &stream,
+            );
+
            match BENCH_TYPE.get().unwrap() {
                BenchmarkType::Latency => {
-                    // Generate private compression key
-                    let cks = ClientKey::new(param);
-                    let private_compression_key = cks.new_compression_private_key(comp_param);
-
-                    // Generate and convert compression keys
-                    let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
-                    let (compressed_compression_key, compressed_decompression_key) = radix_cks
-                        .new_compressed_compression_decompression_keys(&private_compression_key);
-                    let cuda_compression_key =
-                        compressed_compression_key.decompress_to_cuda(&stream);
-                    let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
-                        radix_cks.parameters().glwe_dimension(),
-                        radix_cks.parameters().polynomial_size(),
-                        radix_cks.parameters().message_modulus(),
-                        radix_cks.parameters().carry_modulus(),
-                        radix_cks.parameters().ciphertext_modulus(),
-                        &stream,
-                    );
-
                    // Encrypt
                    let ct = cks.encrypt_radix(0_u32, num_blocks);
                    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
@@ -239,28 +250,25 @@ mod cuda {
                    });
                }
                BenchmarkType::Throughput => {
+                    // Execute the operation once to know its cost.
+                    let (cpu_compression_key, cpu_decompression_key) =
+                        cks.new_compression_decompression_keys(&private_compression_key);
+                    let ct = cks.encrypt_radix(0_u32, num_blocks);
+                    let mut builder = CompressedCiphertextListBuilder::new();
+                    builder.push(ct);
+                    let compressed = builder.build(&cpu_compression_key);
+
+                    reset_pbs_count();
+                    // Use CPU operation as pbs_count do not count PBS on GPU backend.
+                    let _: RadixCiphertext =
+                        compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                    let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
                        .ceil() as usize;
-                    let elements = throughput_num_threads(num_block);
+                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));

-                    let cks = ClientKey::new(param);
-                    let private_compression_key = cks.new_compression_private_key(comp_param);
-
-                    let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
-                    let (compressed_compression_key, compressed_decompression_key) = radix_cks
-                        .new_compressed_compression_decompression_keys(&private_compression_key);
-                    let cuda_compression_key =
-                        compressed_compression_key.decompress_to_cuda(&stream);
-                    let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
-                        radix_cks.parameters().glwe_dimension(),
-                        radix_cks.parameters().polynomial_size(),
-                        radix_cks.parameters().message_modulus(),
-                        radix_cks.parameters().carry_modulus(),
-                        radix_cks.parameters().ciphertext_modulus(),
-                        &stream,
-                    );
-
                    // Encrypt
                    let ct = cks.encrypt_radix(0_u32, num_blocks);
                    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
@@ -344,6 +352,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);

 #[cfg(feature = "gpu")]
 use cuda::gpu_glwe_packing2;
+use tfhe::{get_pbs_count, reset_pbs_count};

 fn main() {
    BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap());
--- a/tfhe/benches/integer/oprf.rs
+++ b/tfhe/benches/integer/oprf.rs
@@ -4,9 +4,11 @@ use crate::utilities::{
 };
 use criterion::{black_box, Criterion, Throughput};
 use rayon::prelude::*;
+use std::cmp::max;
 use tfhe::integer::keycache::KEY_CACHE;
 use tfhe::integer::IntegerKeyKind;
 use tfhe::keycache::NamedParam;
+use tfhe::{get_pbs_count, reset_pbs_count};
 use tfhe_csprng::seeders::Seed;

 pub fn unsigned_oprf(c: &mut Criterion) {
@@ -40,12 +42,21 @@ pub fn unsigned_oprf(c: &mut Criterion) {
                });
            }
            BenchmarkType::Throughput => {
+                let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                // Execute the operation once to know its cost.
+                reset_pbs_count();
+                sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                    Seed(0),
+                    bit_size as u64,
+                    num_block as u64,
+                );
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
-                    let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-
                    b.iter(|| {
                        (0..elements).into_par_iter().for_each(|_| {
                            sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
--- a/tfhe/benches/integer/signed_bench.rs
+++ b/tfhe/benches/integer/signed_bench.rs
@@ -8,6 +8,7 @@ use crate::utilities::{
 use criterion::{criterion_group, Criterion, Throughput};
 use rand::prelude::*;
 use rayon::prelude::*;
+use std::cmp::max;
 use std::env;
 use tfhe::integer::keycache::KEY_CACHE;
 use tfhe::integer::prelude::*;
@@ -66,12 +67,20 @@ fn bench_server_key_signed_binary_function_clean_inputs<F>(
                });
            }
            BenchmarkType::Throughput => {
+                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                // Execute the operation once to know its cost.
+                let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+
+                reset_pbs_count();
+                binary_op(&sks, &ct_0, &ct_1);
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
-                    let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-
                    let mut cts_0 = (0..elements)
                        .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
                        .collect::<Vec<_>>();
@@ -151,12 +160,21 @@ fn bench_server_key_signed_shift_function_clean_inputs<F>(
                });
            }
            BenchmarkType::Throughput => {
+                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                // Execute the operation once to know its cost.
+                let clear_1 = rng.gen_range(0u128..bit_size as u128);
+                let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                let ct_1 = cks.encrypt_radix(clear_1, num_block);
+
+                reset_pbs_count();
+                binary_op(&sks, &ct_0, &ct_1);
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
-                    let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-
                    let mut cts_0 = (0..elements)
                        .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
                        .collect::<Vec<_>>();
@@ -233,12 +251,19 @@ fn bench_server_key_unary_function_clean_inputs<F>(
                });
            }
            BenchmarkType::Throughput => {
+                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                // Execute the operation once to know its cost.
+                let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+
+                reset_pbs_count();
+                unary_fn(&sks, &ct_0);
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
-                    let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-
                    let mut cts_0 = (0..elements)
                        .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
                        .collect::<Vec<_>>();
@@ -307,12 +332,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) {
                });
            }
            BenchmarkType::Throughput => {
+                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                // Execute the operation once to know its cost.
+                let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5));
+                let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+
+                reset_pbs_count();
+                sks.if_then_else_parallelized(&cond, &ct_then, &ct_else);
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
-                    let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-
                    let cts_cond = (0..elements)
                        .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5)))
                        .collect::<Vec<_>>();
@@ -830,12 +864,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
                });
            }
            BenchmarkType::Throughput => {
+                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                // Execute the operation once to know its cost.
+                let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                let clear_1 = rng_func(&mut rng, bit_size);
+
+                reset_pbs_count();
+                binary_op(&sks, &mut ct_0, clear_1);
+                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                let elements = throughput_num_threads(num_block);
+                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
-                    let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-
                    let mut cts_0 = (0..elements)
                        .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
                        .collect::<Vec<_>>();
@@ -1328,6 +1370,7 @@ mod cuda {
    use super::*;
    use criterion::criterion_group;
    use rayon::iter::IntoParallelRefIterator;
+    use std::cmp::max;
    use tfhe::core_crypto::gpu::CudaStreams;
    use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
    use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
@@ -1335,11 +1378,12 @@ mod cuda {

    /// Base function to bench a server key function that is a binary operation, input ciphertext
    /// will contain only zero carries
-    fn bench_cuda_server_key_binary_signed_function_clean_inputs<F>(
+    fn bench_cuda_server_key_binary_signed_function_clean_inputs<F, G>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        binary_op: F,
+        binary_op_cpu: G,
    ) where
        F: Fn(
                &CudaServerKey,
@@ -1347,6 +1391,7 @@ mod cuda {
                &mut CudaSignedRadixCiphertext,
                &CudaStreams,
            ) + Sync,
+        G: Fn(&ServerKey, &SignedRadixCiphertext, &SignedRadixCiphertext) + Sync,
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
@@ -1401,14 +1446,22 @@ mod cuda {
                    });
                }
                BenchmarkType::Throughput => {
+                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                    let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                    // Execute the operation once to know its cost.
+                    let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                    let mut ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+
+                    reset_pbs_count();
+                    // Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
+                    binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1);
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                    let elements = throughput_num_threads(num_block);
+                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
-                        let (cks, _cpu_sks) =
-                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                        let gpu_sks = CudaServerKey::new(&cks, &stream);
-
                        let mut cts_0 = (0..elements)
                            .map(|_| {
                                let clearlow = rng.gen::<u128>();
@@ -1460,7 +1513,7 @@ mod cuda {
    }

    macro_rules! define_cuda_server_key_bench_clean_input_signed_fn (
-        (method_name: $server_key_method:ident, display_name:$name:ident) => {
+        (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_binary_signed_function_clean_inputs(
@@ -1469,6 +1522,9 @@ mod cuda {
                        stringify!($name),
                        |server_key, lhs, rhs, stream| {
                            server_key.$server_key_method(lhs, rhs, stream);
+                        },
+                        |server_key_cpu, lhs, rhs| {
+                            server_key_cpu.$server_key_method_cpu(lhs, rhs);
                        }
                    )
                }
@@ -1478,13 +1534,15 @@ mod cuda {

    /// Base function to bench a server key function that is a unary operation, input ciphertext
    /// will contain only zero carries
-    fn bench_cuda_server_key_unary_signed_function_clean_inputs<F>(
+    fn bench_cuda_server_key_unary_signed_function_clean_inputs<F, G>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        unary_op: F,
+        unary_op_cpu: G,
    ) where
        F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, &CudaStreams) + Sync,
+        G: Fn(&ServerKey, &SignedRadixCiphertext) + Sync,
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
@@ -1527,14 +1585,21 @@ mod cuda {
                    });
                }
                BenchmarkType::Throughput => {
+                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                    let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                    // Execute the operation once to know its cost.
+                    let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+
+                    reset_pbs_count();
+                    // Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
+                    unary_op_cpu(&cpu_sks, &ct_0);
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                    let elements = throughput_num_threads(num_block);
+                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
-                        let (cks, _cpu_sks) =
-                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                        let gpu_sks = CudaServerKey::new(&cks, &stream);
-
                        let mut cts_0 = (0..elements)
                            .map(|_| {
                                let clearlow = rng.gen::<u128>();
@@ -1572,7 +1637,7 @@ mod cuda {
    }

    macro_rules! define_cuda_server_key_bench_clean_input_signed_unary_fn (
-        (method_name: $server_key_method:ident, display_name:$name:ident) => {
+        (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_unary_signed_function_clean_inputs(
@@ -1581,6 +1646,9 @@ mod cuda {
                        stringify!($name),
                        |server_key, input, stream| {
                            server_key.$server_key_method(input, stream);
+                        },
+                        |server_key_cpu, lhs| {
+                            server_key_cpu.$server_key_method_cpu(lhs);
                        }
                    )
                }
@@ -1588,15 +1656,17 @@ mod cuda {
        }
    );

-    fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G>(
+    fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G, H>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        binary_op: F,
-        rng_func: G,
+        binary_op_cpu: G,
+        rng_func: H,
    ) where
        F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, ScalarType, &CudaStreams) + Sync,
-        G: Fn(&mut ThreadRng, usize) -> ScalarType,
+        G: Fn(&ServerKey, &mut SignedRadixCiphertext, ScalarType) + Sync,
+        H: Fn(&mut ThreadRng, usize) -> ScalarType,
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
@@ -1650,16 +1720,24 @@ mod cuda {
                    });
                }
                BenchmarkType::Throughput => {
+                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                    let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                    // Execute the operation once to know its cost.
+                    let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                    let clear_0 = rng_func(&mut rng, bit_size);
+
+                    reset_pbs_count();
+                    // Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
+                    binary_op_cpu(&cpu_sks, &mut ct_0, clear_0);
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                    bench_id = format!(
                        "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
                    );
-                    let elements = throughput_num_threads(num_block);
+                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
-                        let (cks, _cpu_sks) =
-                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                        let gpu_sks = CudaServerKey::new(&cks, &stream);
-
                        let mut cts_0 = (0..elements)
                            .map(|_| {
                                let clearlow = rng.gen::<u128>();
@@ -1702,7 +1780,7 @@ mod cuda {
    }

    macro_rules! define_cuda_server_key_bench_clean_input_scalar_signed_fn (
-        (method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
+        (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_binary_scalar_signed_function_clean_inputs(
@@ -1712,6 +1790,9 @@ mod cuda {
                        |server_key, lhs, rhs, stream| {
                            server_key.$server_key_method(lhs, rhs, stream);
                        },
+                        |server_key_cpu, lhs, rhs| {
+                            server_key_cpu.$server_key_method_cpu(lhs, rhs);
+                        },
                        $($rng_fn)*
                    )
                }
@@ -1721,11 +1802,12 @@ mod cuda {

    /// Base function to bench a server key function that is a binary operation for shift/rotate,
    /// input ciphertext will contain only zero carries
-    fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs<F>(
+    fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs<F, G>(
        c: &mut Criterion,
        bench_name: &str,
        display_name: &str,
        binary_op: F,
+        binary_op_cpu: G,
    ) where
        F: Fn(
                &CudaServerKey,
@@ -1733,6 +1815,7 @@ mod cuda {
                &mut CudaUnsignedRadixCiphertext,
                &CudaStreams,
            ) + Sync,
+        G: Fn(&ServerKey, &SignedRadixCiphertext, &RadixCiphertext) + Sync,
    {
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
@@ -1786,14 +1869,23 @@ mod cuda {
                    });
                }
                BenchmarkType::Throughput => {
+                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                    let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                    // Execute the operation once to know its cost.
+                    let clear_1 = rng.gen_range(0u128..bit_size as u128);
+                    let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                    let ct_1 = cks.encrypt_radix(clear_1, num_block);
+
+                    reset_pbs_count();
+                    // Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
+                    binary_op_cpu(&cpu_sks, &ct_0, &ct_1);
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                    let elements = throughput_num_threads(num_block);
+                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
-                        let (cks, _cpu_sks) =
-                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                        let gpu_sks = CudaServerKey::new(&cks, &stream);
-
                        let mut cts_0 = (0..elements)
                            .map(|_| {
                                let clearlow = rng.gen::<u128>();
@@ -1843,7 +1935,7 @@ mod cuda {
    }

    macro_rules! define_cuda_server_key_bench_clean_input_signed_shift_rotate (
-        (method_name: $server_key_method:ident, display_name:$name:ident) => {
+        (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
            ::paste::paste!{
                fn [<cuda_ $server_key_method>](c: &mut Criterion) {
                    bench_cuda_server_key_shift_rotate_signed_function_clean_inputs(
@@ -1852,6 +1944,9 @@ mod cuda {
                        stringify!($name),
                        |server_key, lhs, rhs, stream| {
                            server_key.$server_key_method(lhs, rhs, stream);
+                        },
+                        |server_key_cpu, lhs, rhs| {
+                            server_key_cpu.$server_key_method_cpu(lhs, rhs);
                        }
                    )
                }
@@ -1916,14 +2011,23 @@ mod cuda {
                    });
                }
                BenchmarkType::Throughput => {
+                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                    let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                    // Execute the operation once to know its cost.
+                    let cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5));
+                    let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+                    let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
+
+                    reset_pbs_count();
+                    // Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
+                    cpu_sks.if_then_else_parallelized(&cond, &ct_then, &ct_else);
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                    let elements = throughput_num_threads(num_block);
+                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
-                        let (cks, _cpu_sks) =
-                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                        let gpu_sks = CudaServerKey::new(&cks, &stream);
-
                        let cts_cond = (0..elements)
                            .map(|_| {
                                let ct_cond = cks.encrypt_bool(rng.gen::<bool>());
@@ -1997,246 +2101,291 @@ mod cuda {

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_add,
+        method_name_cpu: unchecked_add_parallelized,
        display_name: add
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_sub,
+        method_name_cpu: unchecked_sub,
        display_name: sub
    );

    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
        method_name: unchecked_neg,
+        method_name_cpu: unchecked_neg,
        display_name: neg
    );

    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
        method_name: unchecked_abs,
+        method_name_cpu: unchecked_abs_parallelized,
        display_name: abs
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_mul,
+        method_name_cpu: unchecked_mul_parallelized,
        display_name: mul
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_div_rem,
+        method_name_cpu: unchecked_div_rem_parallelized,
        display_name: div_mod
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_bitand,
+        method_name_cpu: unchecked_bitand_parallelized,
        display_name: bitand
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_bitor,
+        method_name_cpu: unchecked_bitor_parallelized,
        display_name: bitor
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_bitxor,
+        method_name_cpu: unchecked_bitxor_parallelized,
        display_name: bitxor
    );

    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
        method_name: unchecked_bitnot,
+        method_name_cpu: bitnot,
        display_name: bitnot
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: unchecked_rotate_left,
+        method_name_cpu: unchecked_rotate_left_parallelized,
        display_name: rotate_left
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: unchecked_rotate_right,
+        method_name_cpu: unchecked_rotate_right_parallelized,
        display_name: rotate_right
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: unchecked_left_shift,
+        method_name_cpu: unchecked_left_shift_parallelized,
        display_name: left_shift
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: unchecked_right_shift,
+        method_name_cpu: unchecked_right_shift_parallelized,
        display_name: right_shift
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_eq,
+        method_name_cpu: unchecked_eq_parallelized,
        display_name: eq
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_ne,
+        method_name_cpu: unchecked_ne_parallelized,
        display_name: ne
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_gt,
+        method_name_cpu: unchecked_gt_parallelized,
        display_name: gt
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_ge,
+        method_name_cpu: unchecked_ge_parallelized,
        display_name: ge
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_lt,
+        method_name_cpu: unchecked_lt_parallelized,
        display_name: lt
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_le,
+        method_name_cpu: unchecked_le_parallelized,
        display_name: le
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_min,
+        method_name_cpu: unchecked_min_parallelized,
        display_name: min
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_max,
+        method_name_cpu: unchecked_max_parallelized,
        display_name: max
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_signed_overflowing_add,
+        method_name_cpu: unchecked_signed_overflowing_add_parallelized,
        display_name: overflowing_add
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: unchecked_signed_overflowing_sub,
+        method_name_cpu: unchecked_signed_overflowing_sub_parallelized,
        display_name: overflowing_sub
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_add,
+        method_name_cpu: unchecked_scalar_add,
        display_name: add,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_mul,
+        method_name_cpu: unchecked_scalar_mul_parallelized,
        display_name: mul,
        rng_func: mul_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_sub,
+        method_name_cpu: unchecked_scalar_sub,
        display_name: sub,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_bitand,
+        method_name_cpu: unchecked_scalar_bitand_parallelized,
        display_name: bitand,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_bitor,
+        method_name_cpu: unchecked_scalar_bitor_parallelized,
        display_name: bitor,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_bitxor,
+        method_name_cpu: unchecked_scalar_bitxor_parallelized,
        display_name: bitxor,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_right_shift,
+        method_name_cpu: unchecked_scalar_right_shift_parallelized,
        display_name: right_shift,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_left_shift,
+        method_name_cpu: unchecked_scalar_left_shift_parallelized,
        display_name: left_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_rotate_right,
+        method_name_cpu: unchecked_scalar_rotate_right_parallelized,
        display_name: rotate_right,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_rotate_left,
+        method_name_cpu: unchecked_scalar_rotate_left_parallelized,
        display_name: rotate_left,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_eq,
+        method_name_cpu: unchecked_scalar_eq_parallelized,
        display_name: eq,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_ne,
+        method_name_cpu: unchecked_scalar_ne_parallelized,
        display_name: ne,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_gt,
+        method_name_cpu: unchecked_scalar_gt_parallelized,
        display_name: gt,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_ge,
+        method_name_cpu: unchecked_scalar_ge_parallelized,
        display_name: ge,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_lt,
+        method_name_cpu: unchecked_scalar_lt_parallelized,
        display_name: lt,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_le,
+        method_name_cpu: unchecked_scalar_le_parallelized,
        display_name: le,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_min,
+        method_name_cpu: unchecked_scalar_min_parallelized,
        display_name: min,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_scalar_max,
+        method_name_cpu: unchecked_scalar_max_parallelized,
        display_name: max,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: signed_overflowing_scalar_add,
+        method_name_cpu: signed_overflowing_scalar_add_parallelized,
        display_name: overflowing_add,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: signed_overflowing_scalar_sub,
+        method_name_cpu: signed_overflowing_scalar_sub_parallelized,
        display_name: overflowing_sub,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: unchecked_signed_scalar_div_rem,
+        method_name_cpu: unchecked_signed_scalar_div_rem_parallelized,
        display_name: div_rem,
        rng_func: div_scalar
    );
@@ -2247,234 +2396,277 @@ mod cuda {

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: add,
+        method_name_cpu: add_parallelized,
        display_name: add
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: sub,
+        method_name_cpu: sub_parallelized,
        display_name: sub
    );

    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
        method_name: neg,
+        method_name_cpu: neg_parallelized,
        display_name: neg
    );

    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
        method_name: abs,
+        method_name_cpu: abs_parallelized,
        display_name: abs
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: mul,
+        method_name_cpu: mul_parallelized,
        display_name: mul
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: div_rem,
+        method_name_cpu: div_rem_parallelized,
        display_name: div_mod
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: bitand,
+        method_name_cpu: bitand_parallelized,
        display_name: bitand
    );

    define_cuda_server_key_bench_clean_input_signed_unary_fn!(
        method_name: bitnot,
+        method_name_cpu: bitnot,
        display_name: bitnot
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: bitor,
+        method_name_cpu: bitor_parallelized,
        display_name: bitor
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: bitxor,
+        method_name_cpu: bitxor_parallelized,
        display_name: bitxor
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: rotate_left,
+        method_name_cpu: rotate_left_parallelized,
        display_name: rotate_left
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: rotate_right,
+        method_name_cpu: rotate_right_parallelized,
        display_name: rotate_right
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: left_shift,
+        method_name_cpu: left_shift_parallelized,
        display_name: left_shift
    );

    define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
        method_name: right_shift,
+        method_name_cpu: right_shift_parallelized,
        display_name: right_shift
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: eq,
+        method_name_cpu: eq_parallelized,
        display_name: eq
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: ne,
+        method_name_cpu: ne_parallelized,
        display_name: ne
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: gt,
+        method_name_cpu: gt_parallelized,
        display_name: gt
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: ge,
+        method_name_cpu: ge_parallelized,
        display_name: ge
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: lt,
+        method_name_cpu: lt_parallelized,
        display_name: lt
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: le,
+        method_name_cpu: le_parallelized,
        display_name: le
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: min,
+        method_name_cpu: min_parallelized,
        display_name: min
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: max,
+        method_name_cpu: max_parallelized,
        display_name: max
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: signed_overflowing_add,
+        method_name_cpu: signed_overflowing_add_parallelized,
        display_name: overflowing_add
    );

    define_cuda_server_key_bench_clean_input_signed_fn!(
        method_name: signed_overflowing_sub,
+        method_name_cpu: signed_overflowing_sub_parallelized,
        display_name: overflowing_sub
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_add,
+        method_name_cpu: scalar_add_parallelized,
        display_name: add,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_mul,
+        method_name_cpu: scalar_mul_parallelized,
        display_name: mul,
        rng_func: mul_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_sub,
+        method_name_cpu: scalar_sub_parallelized,
        display_name: sub,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_bitand,
+        method_name_cpu: scalar_bitand_parallelized,
        display_name: bitand,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_bitor,
+        method_name_cpu: scalar_bitor_parallelized,
        display_name: bitor,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_bitxor,
+        method_name_cpu: scalar_bitxor_parallelized,
        display_name: bitxor,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_left_shift,
+        method_name_cpu: scalar_left_shift_parallelized,
        display_name: left_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_right_shift,
+        method_name_cpu: scalar_right_shift_parallelized,
        display_name: right_shift,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_rotate_left,
+        method_name_cpu: scalar_rotate_left_parallelized,
        display_name: rotate_left,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_rotate_right,
+        method_name_cpu: scalar_rotate_right_parallelized,
        display_name: rotate_right,
        rng_func: shift_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_eq,
+        method_name_cpu: scalar_eq_parallelized,
        display_name: eq,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_ne,
+        method_name_cpu: scalar_ne_parallelized,
        display_name: ne,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_gt,
+        method_name_cpu: scalar_gt_parallelized,
        display_name: gt,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_ge,
+        method_name_cpu: scalar_ge_parallelized,
        display_name: ge,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_lt,
+        method_name_cpu: scalar_lt_parallelized,
        display_name: lt,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_le,
+        method_name_cpu: scalar_le_parallelized,
        display_name: le,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_min,
+        method_name_cpu: scalar_min_parallelized,
        display_name: min,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: scalar_max,
+        method_name_cpu: scalar_max_parallelized,
        display_name: max,
        rng_func: default_signed_scalar
    );

    define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
        method_name: signed_scalar_div_rem,
+        method_name_cpu: signed_scalar_div_rem_parallelized,
        display_name: div_rem,
        rng_func: div_scalar
    );
@@ -2697,6 +2889,7 @@ use cuda::{
    cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops,
    unchecked_cuda_ops, unchecked_scalar_cuda_ops,
 };
+use tfhe::{get_pbs_count, reset_pbs_count};

 #[cfg(feature = "gpu")]
 fn go_through_gpu_bench_groups(val: &str) {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Testé	cae938a75b	wip: measure latencies of a list of cts	2025-01-28 11:19:27 +01:00
David Testé	bae1d1cf77	WIP: fix gpu streams and use iter_batched	2025-01-22 10:56:08 +01:00
David Testé	a3bc1a9d9e	chore(bench): new heuristic to define elements for throughput This is done to fill up backend with enough elements to fill the backend and avoid having long execution time for heavy operations like multiplication or division.	2025-01-20 15:21:05 +01:00
Nicolas Sarlin	cc85c441ea	chore(doc): add strings feature to doctests	2025-01-20 10:59:12 +01:00
Mayeul@Zama	1f254d6523	chore: feature gating js_high_level_api	2025-01-20 10:00:49 +01:00
Mayeul@Zama	909ce4ecbb	chore: update rust dependencies	2025-01-20 09:42:38 +01:00
Mayeul@Zama	67783f4683	chore: upgrade wasm-bindgen	2025-01-20 09:42:38 +01:00
tmontaigu	c62112a4a9	feat(c-hlapi): add abs	2025-01-17 17:04:18 +01:00
tmontaigu	9eb2eb9f0e	refactor(c-hlapi): have different macro for FheUint/FheInt As FheUint and FheInt may have different set of functions, we now have 2 different macros to define their C API.	2025-01-17 17:04:18 +01:00
Arthur Meyre	aa5b431aae	test(core): use 5 bits parameters tweaked to 4 bits to avoid high pfail	2025-01-17 16:06:27 +01:00
David Testé	2b914ae57a	chore(ci): use ubuntu 22.04 to run security checks Sagemath is not available as a package on Ubuntu 24.04.	2025-01-17 11:31:40 +01:00
Nicolas Sarlin	f4a8991f67	fix(safe_ser): aliases in named for renamed types deserialization	2025-01-17 11:13:43 +01:00
David Testé	a882262691	chore(ci): fix recursive search for changes in docs files	2025-01-17 10:04:46 +01:00
aquint-zama	1976a9dce6	chore: update release workflow	2025-01-16 16:04:58 +01:00
aquint-zama	ec87c15cc2	chore: add slsa support for tfhe-ntt	2025-01-16 16:04:58 +01:00
aquint-zama	c273e973bb	chore: add slsa support for tfhe-fft	2025-01-16 16:04:58 +01:00
aquint-zama	07e3fb2779	chore: add slsa support for tfhe-cuda-backend	2025-01-16 16:04:58 +01:00
aquint-zama	c2d4e77eec	chore: add slsa support for tfhe-versionable	2025-01-16 16:04:58 +01:00
aquint-zama	cdf627f2d5	chore: add slsa support for tfhe-csprng	2025-01-16 16:04:58 +01:00
aquint-zama	2f79f646f7	chore: add slsa support for tfhe-zk-pok	2025-01-16 16:04:58 +01:00
Guillermo Oyarzun	a9e4724178	feat(gpu): implement fhe rand on gpu	2025-01-16 14:48:09 +01:00
David Testé	7a8efb1934	chore(ci): display head ref in slack notification message Default 'Ref' displayed in message relies on github.ref value. On pull_request_target, it's the base_ref, instead of head_ref, that is set as value for github.ref. We cannot change 'Ref' field directly. As a workaround, we hide 'Ref' in the message and display the head_ref directly in SLACK_MESSAGE.	2025-01-16 14:25:05 +01:00
David Testé	bc1aeeb85e	chore(ci): skip aws fast tests if ci files changed This would skip 'check-user-permission' job if the event 'pull_request_target' is emitted and CI files have changed. It avoids overlapping of 'pull_request' and 'pull_request_target' events. CI changes would only be tested on 'pull_request' for Zama own pull requests.	2025-01-16 14:25:05 +01:00
Nicolas Sarlin	cbdba38147	fix(shortint): error message in packing keyswitch	2025-01-16 13:08:36 +01:00
Agnes Leroy	cea871fc6b	chore(gpu): update multi-bit parameters	2025-01-16 10:23:04 +01:00
Arthur Meyre	34a006a3ee	chore(ci): fix clippy on M1 for disabled multi bit noise	2025-01-16 09:44:21 +01:00
Pedro Alves	b3740e75f2	chore(docs): Remove mention to NVLink NVLink is not needed anymore in the CUDA backend.	2025-01-16 09:35:15 +01:00
Andrei Stoian	b46affa45b	chore(gpu): add reference to gemm algorithm	2025-01-16 09:07:58 +01:00
Mayeul@Zama	72095144dc	chore: fix new lints	2025-01-15 15:16:15 +01:00
Arthur Meyre	a91e8618c9	chore(docs): fix various issues with the docs	2025-01-15 11:37:04 +01:00
Nicolas Sarlin	9a64c34989	chore(lint): use dylint as lint driver for tfhe-lint	2025-01-14 18:30:04 +01:00
Nicolas Sarlin	7103a83ce5	chore(zk)!: use builtin isqrt instead of the internal implementation BREAKING CHANGE: bump MSRV	2025-01-14 18:29:29 +01:00
Nicolas Sarlin	1f41a6b85d	chore(zk): sqr fc takes u64 as parameter to avoid overflow	2025-01-14 18:29:29 +01:00
David Testé	ccc647a5ee	chore(ci): remove paths for pull_request_target event A 'paths:' directive with only excluded paths won't trigger the event.	2025-01-14 17:37:48 +01:00
Arthur Meyre	ffd4f5a93e	chore(ci): remove unused Dockerfile for wasm tests	2025-01-14 17:31:18 +01:00
David Testé	594157ecaa	chore(ci): choose pull request event with path changes To be able to run CI for external contribution, through forked repository, and be able to test CI modification in a development branch, we need to discriminate pull request events. For the former 'pull_request_target' event is needed to have access to GitHub secrets. 'pull_request' event is required for the latter otherwise the workflow would be pulled from the HEAD of the base branch and thus wouldn't contain changes from the developer.	2025-01-14 14:38:02 +01:00
dependabot[bot]	8ae871ec33	chore(deps): bump actions/upload-artifact from 4.5.0 to 4.6.0 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.5.0 to 4.6.0. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](`6f51ac03b9...65c4c4a1dd`) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-14 13:48:39 +01:00
dependabot[bot]	9535544409	chore(deps): bump actions-cool/check-user-permission from 2.2.1 to 2.3.0 Bumps [actions-cool/check-user-permission](https://github.com/actions-cool/check-user-permission) from 2.2.1 to 2.3.0. - [Release notes](https://github.com/actions-cool/check-user-permission/releases) - [Changelog](https://github.com/actions-cool/check-user-permission/blob/main/CHANGELOG.md) - [Commits](`956b2e73cd...7b90a27f92`) --- updated-dependencies: - dependency-name: actions-cool/check-user-permission dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-14 13:48:31 +01:00
dependabot[bot]	4438042b7d	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.19 to 3.0.20. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`6ae615f647...c3a2b64f69`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-14 13:48:23 +01:00
Arthur Meyre	f7189edb20	chore(ci): temporarily disable the multi bit noise check on Apple M1	2025-01-14 10:56:36 +01:00
Nicolas Sarlin	7058b3611a	chore(backward): update data repo to 0.5	2025-01-14 09:35:32 +01:00
Pedro Alves	59b315993d	chore(gpu): update new_multi_gpu() description	2025-01-14 09:24:24 +01:00
Arthur Meyre	3d1998635a	docs: indicate PBS benchmarks have Gaussian parameters	2025-01-13 16:57:40 +01:00
Arthur Meyre	3fa72e62ae	chore(docs): replace tabs by spaces	2025-01-13 16:57:40 +01:00
Arthur Meyre	0d43912884	docs: add TUniform distribution and link in benchmarks	2025-01-13 16:57:40 +01:00
Agnes Leroy	9930550b69	chore(doc): update links to the benchmark tables	2025-01-13 16:57:40 +01:00
Nicolas Sarlin	bdc3539954	doc(zk): explain how to use zkv1	2025-01-13 13:40:16 +01:00
tmontaigu	979a1b36f8	chore(docs): add strings guides	2025-01-13 13:25:47 +01:00
Andrei Stoian	298fd66631	feat(gpu): optimize packing keyswitch on gpu	2025-01-13 09:18:53 -03:00
David Testé	0952dfa1ad	chore(ci): run teardown job only if setup is a success	2025-01-13 12:19:39 +01:00
Mayeul@Zama	e1e567a145	chore: remove unused cargo key	2025-01-13 12:03:06 +01:00
Nicolas Sarlin	ee1a534584	chore: formatting uniformization for c and js	2025-01-10 16:09:04 +01:00
Nicolas Sarlin	c9eef7d193	chore: enable space in editorconfig for more file types	2025-01-10 16:09:04 +01:00
Nicolas Sarlin	4c8d55f32b	chore(zk): add bench zk v1 vs v2	2025-01-10 15:47:25 +01:00
Nicolas Sarlin	cd5b3c61eb	chore(backward): move compat tests to avoid compiling them everytime	2025-01-10 09:43:04 +01:00
Arthur Meyre	baefb7d911	chore: enable strings for docs.rs generation	2025-01-09 19:48:43 +01:00
Guillermo Oyarzun	d2a3450ab9	fix(gpu): fix cornercase in match value function	2025-01-09 17:51:31 +01:00
David Testé	6fb13328ec	chore(ci): put back weekly core_crypto benchmarks	2025-01-09 17:45:14 +01:00
David Testé	f633eedc29	chore(ci): push tfhe-cuda-backend to crates.io unconditionally	2025-01-09 12:10:55 +01:00
Nicolas Sarlin	a9fb3e9fbf	chore(trivium): fix params, reverted back to gaussian	2025-01-09 09:19:04 +01:00
Arthur Meyre	9a4b584419	chore(ci): make cargo-builds-ntt unique to ntt	2025-01-08 17:05:42 +01:00
Arthur Meyre	cdcba5ca13	chore(ci): make test names unique for fft and ntt	2025-01-08 17:05:42 +01:00
Nicolas Sarlin	adf52acd90	chore: add js fmt to "make conformance" and remove linelint autofix which has false positives	2025-01-08 13:23:14 +01:00
Nicolas Sarlin	9ac89fc6bf	fix(bench): use correct name for parameters in wasm benches	2025-01-08 13:23:14 +01:00
Agnes Leroy	a668112694	chore(test): modify cpu multi-bit parameters for noise test	2025-01-08 09:02:07 +01:00
David Testé	ba105cd1d0	chore(ci): relocate permission checking after should-run step This induces a failure if the job has to run AND if the triggering actor isn't a member of the zama-ai organization. That would help tfhe-rs maintainers to re-run only workflows that are supposed to run. The reference is selected based on the event emitted. We also now use token with restricted permission to check out the repository.	2025-01-07 17:36:41 +01:00
David Testé	3690ad0b25	chore(ci): remove pull_request which duplicate pull_request_target Previously pull_request and pull_request_target events were both emitted thus leading one cancelling the other because of concurrency group name format. Since external contribution needs to be allowed we only need pull_request_target event.	2025-01-07 17:36:41 +01:00
yuxizama	b9ddeebd29	chore(docs): update discord link	2025-01-07 17:27:46 +01:00
Nicolas Sarlin	bc742e989a	chore(zk-pok): bump to 0.4.0	2025-01-07 15:36:49 +01:00
Nicolas Sarlin	17c714f153	chore(versionable): bump to 0.4.0	2025-01-07 15:36:49 +01:00
Arthur Meyre	e0a264dfa0	chore: update docs version to be 0.11	2025-01-07 14:53:17 +01:00
tmontaigu	0551f4a1cc	feat(hlapi): add strings	2025-01-07 14:52:42 +01:00
Arthur Meyre	54c2f4d14d	chore: bump ntt to 0.4.0	2025-01-07 13:47:08 +01:00
Arthur Meyre	aa12c75312	chore: bump tfhe-fft to 0.7.0	2025-01-07 13:47:08 +01:00
Nicolas Sarlin	3c3e2d720f	chore(shortint): update compression parameters	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	a7bf1cdb43	chore(shortint): update keyswitch 1_1 to 2_2 parameters	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	f06b04fd83	chore(zk)!: store inside the pke params the supported zk scheme BREAKING_CHANGE: - Zk for compact PKE now requires dedicated encryption parameters	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	c19683a320	chore(shortint): move tuniform 0.10 parameters into their own folder	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	96ca0d4f7c	chore(shortint): re-export v0.10 params at top level	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	b6d1b5ffff	chore(shortint): update multibit gaussian parameters	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	8ee1bdd9a9	chore(shortint): update classic gaussian param	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	58801cf7a5	chore(shortint): update gaussian compact pk parameters	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	010fb790c2	chore(shortint): remove tuniform pbs_ks parameters	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	8a795c79ac	chore(shortint): rename wopbs params to LEGACY_	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	f4c956636f	chore(zk): add a test with zkv1 in integer compact list encryption	2025-01-07 12:02:09 +01:00
Nicolas Sarlin	cdca7be20b	chore(shortint): new parameters for tuniform	2025-01-07 12:02:09 +01:00
Agnes Leroy	b7f1318815	chore(gpu): bump backend version	2025-01-07 11:33:16 +01:00
Arthur Meyre	721cb3bcbf	chore(ci): fix workflow not triggering on pull request	2025-01-07 10:34:07 +01:00
Pedro Alves	53fed5eb21	fix(gpu): fix delta calculation when Torus is not a 64-bit type	2025-01-06 17:36:36 -03:00
Pedro Alves	e1b57fabe0	chore(gpu): port fix to compression encoding - Modifies the generation of the LUT used in decompression so that the delta is calculated with a different precision, as in the CPU implementation	2025-01-06 17:36:36 -03:00