refactor(gpu): mono-kernel TBC

2026-04-28 03:01:21 -04:00 · 2024-08-16 15:45:09 +00:00
673 changed files with 24188 additions and 68863 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,10 +44,13 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -72,7 +75,7 @@ jobs:
          echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"

      - name: Clone test data
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
          repository: zama-ai/tfhe-backward-compat-data
@@ -87,7 +90,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -100,7 +103,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,7 +114,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -26,7 +26,6 @@ jobs:
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
-      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -51,13 +50,13 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,15 +64,10 @@ jobs:
              - tfhe/Cargo.toml
              - concrete-csprng/**
              - tfhe-zk-pok/**
-              - utils/tfhe-versionable/**
-              - utils/tfhe-versionable-derive/**
            csprng:
              - concrete-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
-            versionable:
-              - utils/tfhe-versionable/**
-              - utils/tfhe-versionable-derive/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -109,7 +103,6 @@ jobs:
        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
-          steps.changed-files.outputs.versionable_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -131,7 +124,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -151,10 +144,13 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -171,11 +167,6 @@ jobs:
        run: |
          make test_zk_pok

-      - name: Run tfhe-versionable tests
-        if: needs.should-run.outputs.versionable_test == 'true'
-        run: |
-          make test_versionable
-
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -194,8 +185,7 @@ jobs:
      - name: Run js on wasm API tests
        if: needs.should-run.outputs.wasm_test == 'true'
        run: |
-          make install_node
-          make test_nodejs_wasm_api_ci
+          make test_nodejs_wasm_api_in_docker

      - name: Gen Keys if required
        if: needs.should-run.outputs.shortint_test == 'true' ||
@@ -223,9 +213,9 @@ jobs:
          make test_safe_deserialization

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -238,7 +228,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -249,7 +239,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -26,11 +26,6 @@ on:

 jobs:
  should-run:
-    if:
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
-      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
@@ -39,14 +34,14 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -72,7 +67,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,10 +85,13 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: "false"
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -122,9 +120,9 @@ jobs:
          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -137,7 +135,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -148,7 +146,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -26,11 +26,6 @@ on:

 jobs:
  should-run:
-    if:
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
-      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
@@ -39,14 +34,14 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -72,7 +67,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,10 +85,13 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: "false"
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -126,9 +124,9 @@ jobs:
          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -141,7 +139,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -152,7 +150,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -57,13 +57,13 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -131,7 +131,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -151,10 +151,13 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -219,9 +222,9 @@ jobs:
          make test_kreyvium

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -234,7 +237,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -245,7 +248,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,21 +45,22 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

-      - name: Install web resources
+      - name: Install Node
        run: |
          make install_node
-          make install_chrome_browser
-          make install_chrome_web_driver

      - name: Run fmt checks
        run: |
@@ -67,16 +68,16 @@ jobs:

      - name: Run js on wasm API tests
        run: |
-          make test_nodejs_wasm_api_ci
+          make test_nodejs_wasm_api_in_docker

      - name: Run parallel wasm tests
        run: |
-          make test_web_js_api_parallel_chrome_ci
+          make test_web_js_api_parallel_ci

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -89,7 +90,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,7 +101,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -1,146 +0,0 @@
-# Run all ERC20 benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: ERC20 benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 5a.m.
-    - cron: '0 5 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (erc20-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: bench
-
-  erc20-benchmarks:
-    name: Execute ERC20 benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    continue-on-error: true
-    timeout-minutes: 720  # 12 hours
-    steps:
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Run benchmarks
-        run: |
-          make bench_hlapi_erc20
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512
-
-      - name: Parse PBS counts
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
-          --append-results
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_erc20
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "ERC20 benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (erc20-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, erc20-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -1,195 +0,0 @@
-# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: ERC20 GPU H100 benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 5a.m.
-    - cron: '0 5 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-erc20-benchmarks)
-    runs-on: ubuntu-latest
-    if:  github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-erc20-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks
-        run: |
-          make bench_hlapi_erc20_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512
-
-      - name: Parse PBS counts
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
-          --append-results
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_erc20
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-erc20-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-erc20-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_2H100_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_2H100_full.yml
@@ -1,194 +0,0 @@
-# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer 2xH100 benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: 2-h100
-
-  cuda-integer-full-2-gpu-benchmarks:
-    name: Execute 2xH100 integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x2" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_l40.yml
+++ b/.github/workflows/benchmark_gpu_l40.yml
@@ -1,206 +0,0 @@
-# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
-name: Cuda benchmarks (L40)
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-l40-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: l40 
-
-  cuda-l40-benchmarks:
-    name: Cuda benchmarks (L40)
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Run compression benchmarks with AVX512
-        run: |
-          make bench_integer_compression_gpu
-
-      - name: Run PBS benchmarks 
-        run: |
-          make bench_pbs_gpu
-
-      - name: Run KS benchmarks 
-        run: |
-          make bench_ks_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-L40x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-l40-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}
-          SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-l40-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,10 +48,9 @@ jobs:
    continue-on-error: true
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -95,17 +94,17 @@ jobs:
      - name: Parse key sizes results
        run: |
          python3 ./ci/benchmark_parser.py tfhe/boolean_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
+          --key-sizes \
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -114,13 +113,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -133,7 +140,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -144,7 +151,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (boolean-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -25,7 +25,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -28,6 +28,3 @@ jobs:
          excludeTitle: "true" # optional: this excludes the title of a pull request
          checkAllCommitMessages: "true" # optional: this checks all commits associated with a pull request
          accessToken: ${{ secrets.GITHUB_TOKEN }} # github access token is only required if checkAllCommitMessages is true
-
-      - name: Check commit signatures
-        uses: 1Password/check-signed-commits-action@ed2885f3ed2577a4f5d3c3fe895432a557d23d52
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Get actionlint
        run: |
@@ -25,9 +25,3 @@ jobs:
      - name: Lint workflows
        run: |
          make lint_workflow
-
-      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ed00f72a3ca5b6eff8ad4d3ffdcacedb67a21db1 # v3.0.15
-        with:
-          allowlist: |
-            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,7 +44,11 @@ jobs:
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -53,7 +57,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          files_yaml: |
            tfhe:
@@ -83,7 +87,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238
+        uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +101,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238
+        uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -108,7 +112,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -121,7 +125,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -132,7 +136,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,10 +44,9 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -87,13 +86,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -102,13 +101,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -121,7 +128,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -132,7 +139,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -56,7 +56,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -64,12 +64,10 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -130,13 +128,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -145,8 +143,16 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
@@ -156,7 +162,7 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-core-crypto-benchmarks.result }}
          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ needs.cuda-core-crypto-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -169,7 +175,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -180,7 +186,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,10 +45,13 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -62,7 +65,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -75,7 +78,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -86,7 +89,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -117,7 +117,7 @@ jobs:
    - name: Slack Notification
      if: ${{ always() && job.status == 'failure' }}
      continue-on-error: true
-      uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+      uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
      env:
        SLACK_COLOR: ${{ job.status }}
        SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
--- a/.github/workflows/gpu_4090_benchmark.yml
+++ b/.github/workflows/gpu_4090_benchmark.yml
@@ -39,10 +39,9 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -59,7 +58,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -83,7 +82,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -91,13 +90,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -114,7 +121,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -132,7 +139,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -157,7 +164,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -177,9 +184,9 @@ jobs:
          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ !success() && !cancelled() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -34,10 +34,9 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -75,9 +74,9 @@ jobs:
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -28,13 +28,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,7 +44,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,10 +108,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -148,7 +147,6 @@ jobs:
      - name: Run core crypto and internal CUDA backend tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
-          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

      - name: Run user docs tests
@@ -167,11 +165,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -184,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -195,7 +193,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -27,13 +27,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -43,7 +43,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -63,7 +63,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -106,10 +106,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -146,7 +145,6 @@ jobs:
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
-          make test_integer_compression_gpu
          make test_cuda_backend

      - name: Run user docs tests
@@ -165,11 +163,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -182,7 +180,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -193,7 +191,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -1,156 +0,0 @@
-# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: TFHE Cuda Backend - Full tests on H100
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-
-on:
-  workflow_dispatch:
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-h100-tests)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-tests-linux:
-    name: CUDA H100 tests
-    needs: [ setup-instance ]
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: stable
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run core crypto, integer and internal CUDA backend tests
-        run: |
-          make test_gpu
-
-      - name: Run user docs tests
-        run: |
-          make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          make test_high_level_api_gpu
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    if: ${{ failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-h100-tests)
-    needs: [ setup-instance, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -28,13 +28,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,7 +44,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,10 +108,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -145,10 +144,6 @@ jobs:
        if: ${{ !cancelled() }}
        run: nvidia-smi

-      - name: Run multi-bit CUDA integer compression tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
-
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
@@ -170,11 +165,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -187,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -198,7 +193,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,10 +53,9 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -95,9 +94,9 @@ jobs:
          make pcc_gpu

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -110,7 +109,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -121,7 +120,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -28,14 +28,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -45,7 +44,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -66,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,7 +99,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,9 +107,8 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Set up home
        run: |
@@ -156,11 +154,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -173,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +182,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -36,13 +36,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -52,7 +52,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -73,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -107,7 +107,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,12 +115,10 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -170,11 +168,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-signed-integer-tests ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -187,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -198,7 +196,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -28,14 +28,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -45,7 +44,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -66,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,7 +99,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,9 +107,8 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Set up home
        run: |
@@ -156,11 +154,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -173,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +182,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -35,14 +35,13 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -52,7 +51,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
+              - tfhe/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -73,7 +72,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -107,7 +106,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,9 +114,8 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Set up home
        run: |
@@ -167,11 +165,11 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-unsigned-integer-tests ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -184,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -195,7 +193,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_cpu_benchmark.yml
+++ b/.github/workflows/integer_cpu_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,10 +87,9 @@ jobs:
        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -111,7 +110,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -126,12 +125,6 @@ jobs:
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}

-      # Run these benchmarks only once
-      - name: Run compression benchmarks with AVX512
-        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
-        run: |
-          make bench_integer_compression
-
      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -146,7 +139,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -154,13 +147,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -173,7 +174,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +185,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -59,7 +59,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -68,10 +68,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -125,7 +124,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -145,13 +144,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -160,8 +159,16 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
@@ -171,7 +178,7 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -184,7 +191,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -195,7 +202,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
      max-parallel: 1
      matrix:
        command: [integer, integer_multi_bit]
-        op_flavor: [default]
+        op_flavor: [default, unchecked]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
@@ -63,7 +63,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -72,10 +72,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -116,7 +115,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -130,12 +129,6 @@ jobs:
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu

-      # Run these benchmarks only once
-      - name: Run compression benchmarks with AVX512
-        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
-        run: |
-          make bench_integer_compression_gpu
-
      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -151,7 +144,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -159,18 +152,26 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -183,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -194,7 +195,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -42,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -72,7 +72,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -81,10 +81,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -148,7 +147,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -168,13 +167,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -183,18 +182,27 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
          SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -207,7 +215,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -218,7 +226,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
@@ -42,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -73,7 +73,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -82,10 +82,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -126,7 +125,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -165,7 +164,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -173,18 +172,26 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
          SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -197,7 +204,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -208,7 +215,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -39,7 +39,7 @@ jobs:
          profile: multi-h100

  cuda-integer-full-multi-gpu-benchmarks:
-    name: Execute multi GPU integer benchmarks
+    name: Execute multi GPU integer benchmarks for all operations flavor
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
@@ -48,8 +48,8 @@ jobs:
      fail-fast: false
      max-parallel: 1
      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
+        command: [integer, integer_multi_bit]
+        op_flavor: [default, unchecked]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
@@ -63,7 +63,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -72,10 +72,9 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -116,7 +115,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -145,7 +144,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -153,18 +152,26 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -177,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -188,7 +195,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -34,7 +34,7 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'

@@ -149,7 +149,7 @@ jobs:
      - name: Slack Notification
        if: ${{ needs.cargo-builds.result != 'skipped' }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cargo-builds.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -36,13 +36,13 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
      - name: Prepare package
        run: |
          cargo package -p tfhe
-      - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate
          path: target/package/*.crate
@@ -74,7 +74,7 @@ jobs:
      id-token: write
    steps:
      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
      - name: Create NPM version tag
@@ -101,7 +101,7 @@ jobs:
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: failure
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
@@ -146,7 +146,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -31,7 +31,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_concrete_tfhe_versionable.yml
+++ b/.github/workflows/make_release_concrete_tfhe_versionable.yml
@@ -2,11 +2,6 @@ name: Publish tfhe-versionable release

 on:
  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -17,28 +12,21 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

-      - name: Publish proc-macro crate
-        env:
-          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
-        run: |
-          cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
-
-      - name: Publish main crate
-        if: ${{ ! inputs.dry_run }}
+      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: |
+          cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
          cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -29,14 +29,14 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: aws
-          profile: gpu-build
+          profile: gpu-test

  publish-cuda-release:
    name: Publish CUDA Release
@@ -54,7 +54,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -99,7 +99,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -123,7 +123,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -32,7 +32,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -17,10 +17,10 @@ jobs:
    runs-on: large_ubuntu_16
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Checkout lattice-estimator
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
@@ -42,7 +42,7 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/shortint_cpu_benchmark.yml
+++ b/.github/workflows/shortint_cpu_benchmark.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,10 +79,9 @@ jobs:
        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -103,7 +102,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -138,11 +137,11 @@ jobs:
        if: matrix.op_flavor == 'default'
        run: |
          python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
+          --key-sizes \
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -150,13 +149,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -169,7 +176,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -180,7 +187,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (shortint-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/signed_integer_cpu_benchmark.yml
+++ b/.github/workflows/signed_integer_cpu_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,10 +87,9 @@ jobs:
        op_flavor: [ default, unchecked ]
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -111,7 +110,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -140,7 +139,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -148,13 +147,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -167,7 +174,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -178,7 +185,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (signed-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
      - name: git-sync
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -33,13 +33,13 @@ jobs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -64,7 +64,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,16 +78,11 @@ jobs:
    needs: setup-instance
    if: needs.setup-instance.result != 'skipped'
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      max-parallel: 1
-      matrix:
-        browser: [ chrome, firefox ]
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -97,20 +92,20 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
      - name: Install rust
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

-      - name: Install web resources
-        run: |
-          make install_node
-          make install_${{ matrix.browser }}_browser
-          make install_${{ matrix.browser }}_web_driver
-
      - name: Run benchmarks
        run: |
-          make bench_web_js_api_parallel_${{ matrix.browser }}_ci
+          make install_node
+          make bench_web_js_api_parallel_ci

      - name: Parse results
        run: |
@@ -123,29 +118,25 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --key-gen
-          rm tfhe/wasm_pk_gen.csv

-      # Run these benchmarks only once
      - name: Measure public key and ciphertext sizes in HL Api
-        if:  matrix.browser == 'chrome'
        run: |
          make measure_hlapi_compact_pk_ct_sizes

      - name: Parse key and ciphertext sizes results
-        if:  matrix.browser == 'chrome'
        run: |
          python3 ./ci/benchmark_parser.py tfhe/hlapi_cpk_and_cctl_sizes.csv ${{ env.RESULTS_FILENAME }} \
          --key-gen \
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
-          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
+          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -154,16 +145,24 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "WASM benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (wasm-client-benchmarks)
@@ -173,7 +172,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +183,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/zk_pke_benchmark.yml
+++ b/.github/workflows/zk_pke_benchmark.yml
@@ -30,13 +30,13 @@ jobs:
      zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,10 +84,9 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -108,7 +107,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -135,17 +134,17 @@ jobs:
      - name: Parse CRS sizes results
        run: |
          python3 ./ci/benchmark_parser.py tfhe/pke_zk_crs_sizes.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
+          --key-sizes \
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer_zk
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -154,13 +153,21 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ !success() && !cancelled() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "PKE ZK benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -173,7 +180,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +191,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@65e6fc1ce697e2df8149d9ae9909acc5ec5599ce
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (pke-zk-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.gitignore
+++ b/.gitignore
@@ -26,12 +26,6 @@ backends/tfhe-cuda-backend/cuda/cmake-build-debug/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
-venv/
-web-test-runner/

 # Dir used for backward compatibility test data
 tfhe/tfhe-backward-compat-data/
-
-# Sampling tool stuff
-/venv/
-**/*.algo_sample_acquistion
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,9 +9,13 @@ members = [
    "backends/tfhe-cuda-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
-    "tfhe-rs-cost-model"
 ]
-exclude = ["tfhe/backward_compatibility_tests"]
+
+exclude = [
+    "tfhe/backward_compatibility_tests",
+    "utils/cargo-tfhe-lints-inner",
+    "utils/cargo-tfhe-lints"
+]

 [profile.bench]
 lto = "fat"
--- a/317
+++ b/317
@@ -21,16 +21,20 @@ BENCH_OP_FLAVOR?=DEFAULT
 NODE_VERSION=22.6
 FORWARD_COMPAT?=OFF
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=v0.3
+BACKWARD_COMPAT_DATA_BRANCH?=v0.1
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
-WEB_RUNNER_DIR=web-test-runner
-WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
 # copy paste the command in the terminal and change them if required without forgetting the flags
 export RUSTFLAGS?=-C target-cpu=native

+ifeq ($(AVX512_SUPPORT),ON)
+		AVX512_FEATURE=nightly-avx512
+else
+		AVX512_FEATURE=
+endif
+
 ifeq ($(GEN_KEY_CACHE_MULTI_BIT_ONLY),TRUE)
 		MULTI_BIT_ONLY=--multi-bit-only
 else
@@ -142,63 +146,6 @@ install_tfhe_lints:
 	(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
 	cd utils/cargo-tfhe-lints && cargo install --path .

-.PHONY: install_typos_checker # Install typos checker
-install_typos_checker: install_rs_build_toolchain
-	@typos --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
-	( echo "Unable to install typos-cli, unknown error." && exit 1 )
-
-.PHONY: setup_venv # Setup Python virtualenv for wasm tests
-setup_venv:
-	python3 -m venv venv
-	@source venv/bin/activate && \
-	pip3 install -r ci/webdriver_requirements.txt
-
-# This is an internal target, not meant to be called on its own.
-install_web_resource:
-	wget -P $(dest) $(url)
-	@cd $(dest) && \
-	echo "$(checksum) $(filename)" > checksum && \
-	sha256sum -c checksum && \
-	rm checksum && \
-	$(decompress_cmd) $(filename)
-
-install_chrome_browser: url = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chrome-linux64.zip"
-install_chrome_browser: checksum = "c5d7da679f3a353ae4e4420ab113de06d4bd459152f5b17558390c02d9520566"
-install_chrome_browser: dest = "$(WEB_RUNNER_DIR)/chrome"
-install_chrome_browser: filename = "chrome-linux64.zip"
-install_chrome_browser: decompress_cmd = unzip
-
-.PHONY: install_chrome_browser # Install Chrome browser for Linux
-install_chrome_browser: install_web_resource
-
-install_chrome_web_driver: url = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chromedriver-linux64.zip"
-install_chrome_web_driver: checksum = "f041092f403fb7455a6da2871070b6587c32814a3e3c2b0a794d3d4aa4739151"
-install_chrome_web_driver: dest = "$(WEB_RUNNER_DIR)/chrome"
-install_chrome_web_driver: filename = "chromedriver-linux64.zip"
-install_chrome_web_driver: decompress_cmd = unzip
-
-.PHONY: install_chrome_web_driver # Install Chrome web driver for Linux
-install_chrome_web_driver: install_web_resource
-
-install_firefox_browser: url = "https://download-installer.cdn.mozilla.net/pub/firefox/releases/131.0/linux-x86_64/en-US/firefox-131.0.tar.bz2"
-install_firefox_browser: checksum = "4ca8504a62a31472ecb8c3a769d4301dd4ac692d4cc5d51b8fe2cf41e7b11106"
-install_firefox_browser: dest = "$(WEB_RUNNER_DIR)/firefox"
-install_firefox_browser: filename = "firefox-131.0.tar.bz2"
-install_firefox_browser: decompress_cmd = tar -xvf
-
-.PHONY: install_firefox_browser # Install firefox browser for Linux
-install_firefox_browser: install_web_resource
-
-install_firefox_web_driver: url = "https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz"
-install_firefox_web_driver: checksum = "ac26e9ba8f3b8ce0fbf7339b9c9020192f6dcfcbf04a2bcd2af80dfe6bb24260"
-install_firefox_web_driver: dest = "$(WEB_RUNNER_DIR)/firefox"
-install_firefox_web_driver: filename = "geckodriver-v0.35.0-linux64.tar.gz"
-install_firefox_web_driver: decompress_cmd = tar -xvf
-
-.PHONY: install_firefox_web_driver # Install firefox web driver for Linux
-install_firefox_web_driver: install_web_resource
-
 .PHONY: check_linelint_installed # Check if linelint newline linter is installed
 check_linelint_installed:
 	@printf "\n" | linelint - > /dev/null 2>&1 || \
@@ -260,10 +207,6 @@ check_fmt_js: check_nvm_installed
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt

-.PHONY: check_typos # Check for typos in codebase
-check_typos: install_typos_checker
-	@typos && echo "No typos found"
-
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -271,13 +214,6 @@ clippy_gpu: install_rs_check_toolchain
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

-.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
-check_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
-		--all-targets \
-		-p $(TFHE_SPEC)
-
 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
 	linelint -a .
@@ -316,18 +252,12 @@ clippy_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),shortint \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_integer # Run clippy lints enabling the integer features
 clippy_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),integer,experimental \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
 clippy: install_rs_check_toolchain
@@ -354,9 +284,6 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -376,9 +303,6 @@ clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
 clippy_concrete_csprng: install_rs_check_toolchain
@@ -391,17 +315,9 @@ clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok -- --no-deps -D warnings

-.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
-clippy_versionable: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-versionable-derive -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-versionable -- --no-deps -D warnings
-
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_noise_measurement
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -412,18 +328,10 @@ clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings

-.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
-check_rust_bindings_did_not_change:
-	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
-	git diff --quiet HEAD -- backends/tfhe-cuda-backend/src/bindings.rs || \
-	( echo "Generated bindings have changed! Please run 'git add backends/tfhe-cuda-backend/src/bindings.rs' \
-	and commit the changes." && exit 1 ) 
-
-
 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
 	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -431,7 +339,7 @@ build_core: install_rs_build_toolchain install_rs_check_toolchain
 		--features=$(TARGET_ARCH_FEATURE) -p $(TFHE_SPEC)
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),nightly-avx512 -p $(TFHE_SPEC); \
+			--features=$(TARGET_ARCH_FEATURE),$(AVX512_FEATURE) -p $(TFHE_SPEC); \
 	fi

 .PHONY: build_core_experimental # Build core_crypto with experimental features
@@ -440,7 +348,7 @@ build_core_experimental: install_rs_build_toolchain install_rs_check_toolchain
 		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC)
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 -p $(TFHE_SPEC); \
+			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC); \
 	fi

 .PHONY: build_boolean # Build with boolean enabled
@@ -468,23 +376,32 @@ build_tfhe_coverage: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests

+.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
+symlink_c_libs_without_fingerprint:
+	@./scripts/symlink_c_libs_without_fingerprint.sh \
+		--cargo-profile "$(CARGO_PROFILE)" \
+		--lib-name tfhe-c-api-dynamic-buffer
+
 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
+	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
 		-p $(TFHE_SPEC)
+	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
+	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_web_js_api # Build the js API targeting the web browser
 build_web_js_api: install_rs_build_toolchain install_wasm_pack
@@ -502,7 +419,6 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
 		-Z build-std=panic_abort,std && \
 	find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
-	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json

 .PHONY: build_node_js_api # Build the js API targeting nodejs
 build_node_js_api: install_rs_build_toolchain install_wasm_pack
@@ -522,7 +438,7 @@ test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok,nightly-avx512 -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
 	fi

 .PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -536,7 +452,7 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 			--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
 			--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,nightly-avx512 \
+			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
 			-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
 	fi

@@ -565,20 +481,6 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

-.PHONY: test_integer_compression
-test_integer_compression: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compress
-
-.PHONY: test_integer_compression_gpu
-test_integer_compression_gpu: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
-
 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -832,7 +734,7 @@ test_zk_pok: install_rs_build_toolchain
 .PHONY: test_versionable # Run tests for tfhe-versionable subcrate
 test_versionable: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--all-targets -p tfhe-versionable
+		-p tfhe-versionable

 # The backward compat data repo holds historical binary data but also rust code to generate and load them.
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
@@ -840,7 +742,7 @@ test_versionable: install_rs_build_toolchain
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
-		--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
 test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
@@ -913,58 +815,36 @@ check_compile_tests_benches_gpu: install_rs_build_toolchain
 		cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
 		"$(MAKE)" -j "$(CPU_COUNT)"

+.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
+build_nodejs_test_docker:
+	DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
+		-f docker/Dockerfile.wasm_tests --build-arg NODE_VERSION=$(NODE_VERSION) -t tfhe-wasm-tests .
+
+.PHONY: test_nodejs_wasm_api_in_docker # Run tests for the nodejs on wasm API in a docker container
+test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
+	if [[ -t 1 ]]; then RUN_FLAGS="-it"; else RUN_FLAGS="-i"; fi && \
+	docker run --rm "$${RUN_FLAGS}" \
+		-v "$$(pwd)":/tfhe-wasm-tests/tfhe-rs \
+		-v tfhe-rs-root-target-cache:/root/tfhe-rs-target \
+		-v tfhe-rs-pkg-cache:/tfhe-wasm-tests/tfhe-rs/tfhe/pkg \
+		-v tfhe-rs-root-cargo-registry-cache:/root/.cargo/registry \
+		-v tfhe-rs-root-cache:/root/.cache \
+		tfhe-wasm-tests /bin/bash -i -c 'make test_nodejs_wasm_api'
+
 .PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
 test_nodejs_wasm_api: build_node_js_api
-	cd tfhe/js_on_wasm_tests && npm install && npm run test
+	cd tfhe/js_on_wasm_tests && npm run test

-.PHONY: test_nodejs_wasm_api_ci # Run tests for the nodejs on wasm API
-test_nodejs_wasm_api_ci: build_node_js_api
+.PHONY: test_web_js_api_parallel # Run tests for the web wasm api
+test_web_js_api_parallel: build_web_js_api_parallel
+	$(MAKE) -C tfhe/web_wasm_parallel_tests test
+
+.PHONY: test_web_js_api_parallel_ci # Run tests for the web wasm api
+test_web_js_api_parallel_ci: build_web_js_api_parallel
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) test_nodejs_wasm_api
-
-# This is an internal target, not meant to be called on its own.
-run_web_js_api_parallel: build_web_js_api_parallel setup_venv
-	cd $(WEB_SERVER_DIR) && npm install && npm run build
-	source venv/bin/activate && \
-	python ci/webdriver.py \
-	--browser-path $(browser_path) \
-	--driver-path $(driver_path) \
-	--browser-kind  $(browser_kind) \
-	--server-cmd "npm run server" \
-	--server-workdir "$(WEB_SERVER_DIR)" \
-	--id-pattern $(filter)
-
-test_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-test_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-test_web_js_api_parallel_chrome: browser_kind = chrome
-test_web_js_api_parallel_chrome: filter = Test
-
-.PHONY: test_web_js_api_parallel_chrome # Run tests for the web wasm api on Chrome
-test_web_js_api_parallel_chrome: run_web_js_api_parallel
-
-.PHONY: test_web_js_api_parallel_chrome_ci # Run tests for the web wasm api on Chrome
-test_web_js_api_parallel_chrome_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) test_web_js_api_parallel_chrome
-
-test_web_js_api_parallel_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-test_web_js_api_parallel_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-test_web_js_api_parallel_firefox: browser_kind = firefox
-test_web_js_api_parallel_firefox: filter = Test
-
-.PHONY: test_web_js_api_parallel_firefox # Run tests for the web wasm api on Firefox
-test_web_js_api_parallel_firefox: run_web_js_api_parallel
-
-.PHONY: test_web_js_api_parallel_firefox_ci # Run tests for the web wasm api on Firefox
-test_web_js_api_parallel_firefox_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) test_web_js_api_parallel_firefox
+	$(MAKE) -C tfhe/web_wasm_parallel_tests test-ci

 .PHONY: no_tfhe_typo # Check we did not invert the h and f in tfhe
 no_tfhe_typo:
@@ -982,11 +862,6 @@ dieharder_csprng: install_dieharder build_concrete_csprng
 # Benchmarks
 #

-.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
-print_doc_bench_parameters:
-	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe
-
 .PHONY: bench_integer # Run benchmarks for unsigned integer
 bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
@@ -1008,18 +883,6 @@ bench_integer_gpu: install_rs_check_toolchain
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

-.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
-bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench	glwe_packing_compression-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
-
-.PHONY: bench_integer_compression_gpu
-bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench	glwe_packing_compression-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
-
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -1118,47 +981,15 @@ bench_ks_gpu: install_rs_check_toolchain
 	--bench ks-bench \
 	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

-bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-bench_web_js_api_parallel_chrome: browser_kind = chrome
-bench_web_js_api_parallel_chrome: filter = Bench
+.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
+bench_web_js_api_parallel: build_web_js_api_parallel
+	$(MAKE) -C tfhe/web_wasm_parallel_tests bench

-.PHONY: bench_web_js_api_parallel_chrome # Run benchmarks for the web wasm api
-bench_web_js_api_parallel_chrome: run_web_js_api_parallel
-
-.PHONY: bench_web_js_api_parallel_chrome_ci # Run benchmarks for the web wasm api
-bench_web_js_api_parallel_chrome_ci: setup_venv
+.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
+bench_web_js_api_parallel_ci: build_web_js_api_parallel
 	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_parallel_chrome
-
-bench_web_js_api_parallel_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-bench_web_js_api_parallel_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-bench_web_js_api_parallel_firefox: browser_kind = firefox
-bench_web_js_api_parallel_firefox: filter = Bench
-
-.PHONY: bench_web_js_api_parallel_firefox # Run benchmarks for the web wasm api
-bench_web_js_api_parallel_firefox: run_web_js_api_parallel
-
-.PHONY: bench_web_js_api_parallel_firefox_ci # Run benchmarks for the web wasm api
-bench_web_js_api_parallel_firefox_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_parallel_firefox
-
-.PHONY: bench_hlapi_erc20 # Run benchmarks for ECR20 operations
-bench_hlapi_erc20: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
-
-.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ECR20 operations on GPU
-bench_hlapi_erc20_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci

 #
 # Utility tools
@@ -1206,7 +1037,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example wasm_benchmarks_parser \
 	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-	-- wasm_benchmark_results.json
+	-- web_wasm_parallel_tests/test/benchmark_results

 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
 write_params_to_file: install_rs_check_toolchain
@@ -1244,47 +1075,15 @@ sha256_bool: install_rs_check_toolchain
 	--example sha256_bool \
 	--features=$(TARGET_ARCH_FEATURE),boolean

-.PHONY: external_product_noise_measurement # Run scripts to run noise measurement for external_product
-external_product_noise_measurement: setup_venv_noise_measurement install_rs_check_toolchain
-	source venv/bin/activate && \
-	cd tfhe-rs-cost-model/src/ && \
-	python3 external_product_correction.py \
-		--rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--chunks "$$(nproc)" -- \
-		--algorithm multi-bit-ext-prod \
-		--multi-bit-grouping-factor 2
-
-
-.PHONY: external_product_noise_measurement_classic # Run scripts to run noise measurement for external_product
-external_product_noise_measurement_classic: setup_venv_noise_measurement install_rs_check_toolchain
-	source venv/bin/activate && \
-	cd tfhe-rs-cost-model/src/ && \
-	python3 external_product_correction.py \
-		--rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--chunks "$$(nproc)" -- \
-		--algorithm ext-prod
-
-.PHONY: clippy_noise_measurement # Run clippy lints on noise measurement tool
-clippy_noise_measurement: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-rs-cost-model -- --no-deps -D warnings
-
-.PHONY: setup_venv_noise_measurement
-setup_venv_noise_measurement:
-	python3 -m venv venv
-	source venv/bin/activate && \
-	pip install -U pip wheel setuptools && \
-	pip install -r tfhe-rs-cost-model/src/requirements.txt
-
 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
-pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
+pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
 clippy_all tfhe_lints check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change
+pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
-fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
+fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_fast \
 check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a> 
--- a/_typos.toml
+++ b/_typos.toml
@@ -1,15 +0,0 @@
-[default]
-extend-ignore-identifiers-re = [
-    # Related to serialized object
-    "ser",
-    "unser",
-    # Used when dumping tfhe-rs parameters set into Sage format
-    "ND.*",
-    # Related to FHE strings example handling "banana"
-    "ba",
-    "enc_ba",
-    # Example with string replacing "hello" with "herlo"
-    "herlo",
-    # Example in trivium
-    "C9217BA0D762ACA1"
-]
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -7,7 +7,7 @@ using multithreading to accelerate the computation.


 Quite similarly, the function `TriviumStream::<FheBool>::new` will return a very similar object running in FHE space. Its arguments are
-2 arrays of 80 FheBool representing the encrypted Trivium key, and the encrypted IV. It also requires a reference to the server key of the 
+2 arrays of 80 FheBool representing the encrypted Trivium key, and the encrypted IV. It also requires a reference to the the server key of the 
 current scheme. This means that any user of this feature must also have the `tfhe-rs` crate as a dependency.


--- a/apps/trivium/src/kreyvium/kreyvium.rs
+++ b/apps/trivium/src/kreyvium/kreyvium.rs
@@ -148,9 +148,10 @@ where

    /// Computes one turn of the stream, updating registers and outputting the new bit.
    pub fn next_bool(&mut self) -> T {
-        if let Some(sk) = &self.fhe_key {
-            set_server_key(sk.clone());
-        }
+        match &self.fhe_key {
+            Some(sk) => set_server_key(sk.clone()),
+            None => (),
+        };

        let [o, a, b, c] = self.get_output_and_values(0);

@@ -225,12 +226,18 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        if let Some(sk) = &self.fhe_key {
-            rayon::broadcast(|_| set_server_key(sk.clone()));
+        match &self.fhe_key {
+            Some(sk) => {
+                rayon::broadcast(|_| set_server_key(sk.clone()));
+            }
+            None => (),
        }
        let mut values = self.get_64_output_and_values();
-        if self.fhe_key.is_some() {
-            rayon::broadcast(|_| unset_server_key());
+        match &self.fhe_key {
+            Some(_) => {
+                rayon::broadcast(|_| unset_server_key());
+            }
+            None => (),
        }

        let mut ret = Vec::<T>::with_capacity(64);
--- a/apps/trivium/src/kreyvium/kreyvium_byte.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_byte.rs
@@ -237,12 +237,18 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        if let Some(sk) = &self.fhe_key {
-            rayon::broadcast(|_| set_server_key(sk.clone()));
+        match &self.fhe_key {
+            Some(sk) => {
+                rayon::broadcast(|_| set_server_key(sk.clone()));
+            }
+            None => (),
        }
        let values = self.get_64_output_and_values();
-        if self.fhe_key.is_some() {
-            rayon::broadcast(|_| unset_server_key());
+        match &self.fhe_key {
+            Some(_) => {
+                rayon::broadcast(|_| unset_server_key());
+            }
+            None => (),
        }

        let mut bytes = Vec::<T>::with_capacity(8);
--- a/apps/trivium/src/lib.rs
+++ b/apps/trivium/src/lib.rs
@@ -1,5 +1,3 @@
-#![allow(clippy::too_long_first_doc_paragraph)]
-
 mod static_deque;

 mod kreyvium;
--- a/apps/trivium/src/trivium/trivium_bool.rs
+++ b/apps/trivium/src/trivium/trivium_bool.rs
@@ -120,9 +120,10 @@ where

    /// Computes one turn of the stream, updating registers and outputting the new bit.
    pub fn next_bool(&mut self) -> T {
-        if let Some(sk) = &self.fhe_key {
-            set_server_key(sk.clone());
-        }
+        match &self.fhe_key {
+            Some(sk) => set_server_key(sk.clone()),
+            None => (),
+        };

        let [o, a, b, c] = self.get_output_and_values(0);

@@ -195,12 +196,18 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        if let Some(sk) = &self.fhe_key {
-            rayon::broadcast(|_| set_server_key(sk.clone()));
+        match &self.fhe_key {
+            Some(sk) => {
+                rayon::broadcast(|_| set_server_key(sk.clone()));
+            }
+            None => (),
        }
        let mut values = self.get_64_output_and_values();
-        if self.fhe_key.is_some() {
-            rayon::broadcast(|_| unset_server_key());
+        match &self.fhe_key {
+            Some(_) => {
+                rayon::broadcast(|_| unset_server_key());
+            }
+            None => (),
        }

        let mut ret = Vec::<T>::with_capacity(64);
--- a/apps/trivium/src/trivium/trivium_byte.rs
+++ b/apps/trivium/src/trivium/trivium_byte.rs
@@ -187,12 +187,18 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        if let Some(sk) = &self.fhe_key {
-            rayon::broadcast(|_| set_server_key(sk.clone()));
+        match &self.fhe_key {
+            Some(sk) => {
+                rayon::broadcast(|_| set_server_key(sk.clone()));
+            }
+            None => (),
        }
        let values = self.get_64_output_and_values();
-        if self.fhe_key.is_some() {
-            rayon::broadcast(|_| unset_server_key());
+        match &self.fhe_key {
+            Some(_) => {
+                rayon::broadcast(|_| unset_server_key());
+            }
+            None => (),
        }

        let mut bytes = Vec::<T>::with_capacity(8);
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.5.0"
+version = "0.4.0-alpha.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -14,4 +14,3 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
 [build-dependencies]
 cmake = { version = "0.1" }
 pkg-config = { version = "0.3" }
-bindgen = "0.70.1"
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -1,5 +1,4 @@
 use std::env;
-use std::path::PathBuf;
 use std::process::Command;

 fn main() {
@@ -27,7 +26,6 @@ fn main() {
    println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
    println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
    println!("cargo::rerun-if-changed=src");
-
    if env::consts::OS == "linux" {
        let output = Command::new("./get_os_name.sh").output().unwrap();
        let distribution = String::from_utf8(output.stdout).unwrap();
@@ -37,7 +35,6 @@ fn main() {
                Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
            );
        }
-
        let dest = cmake::build("cuda");
        println!("cargo:rustc-link-search=native={}", dest.display());
        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
@@ -54,32 +51,6 @@ fn main() {
        println!("cargo:rustc-link-lib=cudart");
        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
        println!("cargo:rustc-link-lib=stdc++");
-
-        let header_path = "wrapper.h";
-        println!("cargo:rerun-if-changed={}", header_path);
-
-        let out_path = PathBuf::from("src").join("bindings.rs");
-
-        let bindings = bindgen::Builder::default()
-            .header(header_path)
-            // allow only what we are interested in, the custom types appearing in the interface
-            .allowlist_type("PBS_TYPE")
-            .allowlist_type("SHIFT_OR_ROTATE_TYPE")
-            // and the functions reachable from the headers included in wrapper.h
-            .allowlist_function(".*")
-            .clang_arg("-x")
-            .clang_arg("c++")
-            .clang_arg("-std=c++17")
-            .clang_arg("-I/usr/include")
-            .clang_arg("-I/usr/local/include")
-            .ctypes_prefix("ffi")
-            .raw_line("use crate::ffi;")
-            .generate()
-            .expect("Unable to generate bindings");
-
-        bindings
-            .write_to_file(&out_path)
-            .expect("Couldn't write bindings!");
    } else {
        panic!(
            "Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -67,21 +67,9 @@ endif()

 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

-# Check if the DEBUG flag is defined
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  # Debug mode
-  message("Compiling in Debug mode")
-  add_definitions(-DDEBUG)
-  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
-else()
-  # Release mode
-  message("Compiling in Release mode")
-  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
-endif()
-
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
+    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
  -std=c++17 --no-exceptions  --expt-relaxed-constexpr -rdc=true \
  --use_fast_math -Xcompiler -fPIC")

--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,24 +1,25 @@
 #ifndef CUDA_CIPHERTEXT_H
 #define CUDA_CIPHERTEXT_H

-#include "stdint.h"
+#include "device.h"
+#include <cstdint>

 extern "C" {
 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
-                                                  void *dest, void const *src,
+                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
 void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                  uint32_t gpu_index,
-                                                  void *dest, void const *src,
+                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);

 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
-                                 void *lwe_array_out, void const *glwe_array_in,
-                                 uint32_t const *nth_array, uint32_t num_nths,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_glwes,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size);
-}
+};
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -39,15 +39,16 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

+bool cuda_check_support_cooperative_groups();
+
+bool cuda_check_support_thread_block_clusters();
+
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

-void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

-void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                            uint32_t gpu_index);
-
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

@@ -61,13 +62,9 @@ void cuda_synchronize_device(uint32_t gpu_index);
 void cuda_drop(void *ptr, uint32_t gpu_index);

 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
-}

 int cuda_get_max_shared_memory(uint32_t gpu_index);
-
-bool cuda_check_support_cooperative_groups();
-
-bool cuda_check_support_thread_block_clusters();
+}

 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -8,7 +8,7 @@ extern std::mutex m;
 extern bool p2p_enabled;

 extern "C" {
-int32_t cuda_setup_multi_gpu();
+int cuda_setup_multi_gpu();
 }

 // Define a variant type that can be either a vector or a single pointer
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -1,45 +0,0 @@
-#ifndef CUDA_INTEGER_COMPRESSION_H
-#define CUDA_INTEGER_COMPRESSION_H
-
-#include "../../pbs/pbs_enums.h"
-
-extern "C" {
-void scratch_cuda_integer_compress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
-    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory);
-
-void scratch_cuda_integer_decompress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
-    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
-    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t storage_log_modulus, uint32_t body_count,
-    bool allocate_gpu_memory);
-
-void cuda_integer_compress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
-    uint32_t num_nths, int8_t *mem_ptr);
-
-void cuda_integer_decompress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
-    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
-
-void cleanup_cuda_integer_compress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void cleanup_cuda_integer_decompress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -1,124 +0,0 @@
-#ifndef CUDA_INTEGER_COMPRESSION_UTILITIES_H
-#define CUDA_INTEGER_COMPRESSION_UTILITIES_H
-
-#include "../integer_utilities.h"
-
-template <typename Torus> struct int_compression {
-  int_radix_params compression_params;
-  uint32_t storage_log_modulus;
-  uint32_t lwe_per_glwe;
-
-  uint32_t body_count;
-
-  // Compression
-  int8_t *fp_ks_buffer;
-  Torus *tmp_lwe;
-  Torus *tmp_glwe_array_out;
-
-  int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                  uint32_t gpu_count, int_radix_params compression_params,
-                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
-                  uint32_t storage_log_modulus, bool allocate_gpu_memory) {
-    this->compression_params = compression_params;
-    this->lwe_per_glwe = lwe_per_glwe;
-    this->storage_log_modulus = storage_log_modulus;
-    this->body_count = num_radix_blocks;
-
-    if (allocate_gpu_memory) {
-      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
-                                    compression_params.polynomial_size;
-
-      tmp_lwe = (Torus *)cuda_malloc_async(
-          num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
-              sizeof(Torus),
-          streams[0], gpu_indexes[0]);
-      tmp_glwe_array_out = (Torus *)cuda_malloc_async(
-          lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
-          gpu_indexes[0]);
-
-      scratch_packing_keyswitch_lwe_list_to_glwe_64(
-          streams[0], gpu_indexes[0], &fp_ks_buffer,
-          compression_params.glwe_dimension, compression_params.polynomial_size,
-          num_radix_blocks, true);
-    }
-  }
-  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-               uint32_t gpu_count) {
-    cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
-    cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
-                                               &fp_ks_buffer);
-  }
-};
-
-template <typename Torus> struct int_decompression {
-  int_radix_params encryption_params;
-  int_radix_params compression_params;
-
-  uint32_t storage_log_modulus;
-
-  uint32_t num_radix_blocks;
-  uint32_t body_count;
-
-  Torus *tmp_extracted_glwe;
-  Torus *tmp_extracted_lwe;
-  uint32_t *tmp_indexes_array;
-
-  int_radix_lut<Torus> *carry_extract_lut;
-
-  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                    uint32_t gpu_count, int_radix_params encryption_params,
-                    int_radix_params compression_params,
-                    uint32_t num_radix_blocks, uint32_t body_count,
-                    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
-    this->encryption_params = encryption_params;
-    this->compression_params = compression_params;
-    this->storage_log_modulus = storage_log_modulus;
-    this->num_radix_blocks = num_radix_blocks;
-    this->body_count = body_count;
-
-    if (allocate_gpu_memory) {
-      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
-                                    compression_params.polynomial_size;
-      Torus lwe_accumulator_size = (compression_params.glwe_dimension *
-                                        compression_params.polynomial_size +
-                                    1);
-      carry_extract_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, encryption_params, 1,
-          num_radix_blocks, allocate_gpu_memory);
-
-      tmp_extracted_glwe = (Torus *)cuda_malloc_async(
-          num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
-          gpu_indexes[0]);
-      tmp_indexes_array = (uint32_t *)cuda_malloc_async(
-          num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0]);
-      tmp_extracted_lwe = (Torus *)cuda_malloc_async(
-          num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
-          gpu_indexes[0]);
-
-      // Carry extract LUT
-      auto carry_extract_f = [encryption_params](Torus x) -> Torus {
-        return x / encryption_params.message_modulus;
-      };
-
-      generate_device_accumulator<Torus>(
-          streams[0], gpu_indexes[0],
-          carry_extract_lut->get_lut(gpu_indexes[0], 0),
-          encryption_params.glwe_dimension, encryption_params.polynomial_size,
-          encryption_params.message_modulus, encryption_params.carry_modulus,
-          carry_extract_f);
-
-      carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-    }
-  }
-  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-               uint32_t gpu_count) {
-    cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);
-
-    carry_extract_lut->release(streams, gpu_indexes, gpu_count);
-    delete carry_extract_lut;
-  }
-};
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -1,421 +0,0 @@
-#ifndef CUDA_INTEGER_H
-#define CUDA_INTEGER_H
-
-#include "../pbs/pbs_enums.h"
-#include <stdint.h>
-
-enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
-enum SHIFT_OR_ROTATE_TYPE {
-  LEFT_SHIFT = 0,
-  RIGHT_SHIFT = 1,
-  LEFT_ROTATE = 2,
-  RIGHT_ROTATE = 3
-};
-enum BITOP_TYPE {
-  BITAND = 0,
-  BITOR = 1,
-  BITXOR = 2,
-  SCALAR_BITAND = 3,
-  SCALAR_BITOR = 4,
-  SCALAR_BITXOR = 5,
-};
-
-enum COMPARISON_TYPE {
-  EQ = 0,
-  NE = 1,
-  GT = 2,
-  GE = 3,
-  LT = 4,
-  LE = 5,
-  MAX = 6,
-  MIN = 7,
-};
-
-enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
-
-enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
-
-extern "C" {
-void scratch_cuda_apply_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_apply_univariate_lut_kb_64(void *const *streams,
-                                     uint32_t const *gpu_indexes,
-                                     uint32_t gpu_count, void *output_radix_lwe,
-                                     void const *input_radix_lwe,
-                                     int8_t *mem_ptr, void *const *ksks,
-                                     void *const *bsks, uint32_t num_blocks);
-
-void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
-                                             uint32_t const *gpu_indexes,
-                                             uint32_t gpu_count,
-                                             int8_t **mem_ptr_void);
-
-void scratch_cuda_apply_bivariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_apply_bivariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void const *input_radix_lwe_1,
-    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
-    void *const *bsks, uint32_t num_blocks, uint32_t shift);
-
-void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
-                                            uint32_t const *gpu_indexes,
-                                            uint32_t gpu_count,
-                                            int8_t **mem_ptr_void);
-
-void cuda_apply_many_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks,
-    uint32_t num_luts, uint32_t lut_stride);
-
-void scratch_cuda_full_propagation_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_full_propagation_64_inplace(void *const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, void *input_blocks,
-                                      int8_t *mem_ptr, void *const *ksks,
-                                      void *const *bsks, uint32_t num_blocks);
-
-void cleanup_cuda_full_propagation(void *const *streams,
-                                   uint32_t const *gpu_indexes,
-                                   uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_integer_mult_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
-
-void cleanup_cuda_integer_mult(void *const *streams,
-                               uint32_t const *gpu_indexes, uint32_t gpu_count,
-                               int8_t **mem_ptr_void);
-
-void cuda_negate_integer_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
-
-void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
-
-void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
-
-void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
-
-void cleanup_cuda_integer_radix_logical_scalar_shift(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool is_signed, bool allocate_gpu_memory);
-
-void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
-
-void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
-                                                 uint32_t const *gpu_indexes,
-                                                 uint32_t gpu_count,
-                                                 int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_comparison_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
-
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t lwe_ciphertext_count);
-
-void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks);
-
-void cleanup_cuda_integer_comparison(void *const *streams,
-                                     uint32_t const *gpu_indexes,
-                                     uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_bitop_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    BITOP_TYPE op_type, bool allocate_gpu_memory);
-
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t lwe_ciphertext_count);
-
-void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
-    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
-
-void cleanup_cuda_integer_bitop(void *const *streams,
-                                uint32_t const *gpu_indexes, uint32_t gpu_count,
-                                int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_cmux_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
-    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_ciphertext_count);
-
-void cleanup_cuda_integer_radix_cmux(void *const *streams,
-                                     uint32_t const *gpu_indexes,
-                                     uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_scalar_rotate_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_scalar_rotate_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
-
-void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
-                                              uint32_t const *gpu_indexes,
-                                              uint32_t gpu_count,
-                                              int8_t **mem_ptr_void);
-
-void scratch_cuda_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
-
-void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks);
-
-void cleanup_cuda_propagate_single_carry(void *const *streams,
-                                         uint32_t const *gpu_indexes,
-                                         uint32_t gpu_count,
-                                         int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks_in_radix);
-
-void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_integer_radix_overflowing_sub_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
-    void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks_in_radix);
-
-void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_scalar_mul_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint64_t const *decomposed_scalar,
-    uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars);
-
-void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
-                                           uint32_t const *gpu_indexes,
-                                           uint32_t gpu_count,
-                                           int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_integer_div_rem_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *quotient, void *remainder, void const *numerator, void const *divisor,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks_in_radix);
-
-void cleanup_cuda_integer_div_rem(void *const *streams,
-                                  uint32_t const *gpu_indexes,
-                                  uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks_in_radix);
-
-void cleanup_signed_overflowing_add_or_sub(void *const *streams,
-                                           uint32_t const *gpu_indexes,
-                                           uint32_t gpu_count,
-                                           int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_compute_prefix_sum_hillis_steele_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift);
-
-void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
-                                            uint32_t const *gpu_indexes,
-                                            uint32_t gpu_count, void *lwe_array,
-                                            uint32_t num_blocks,
-                                            uint32_t lwe_size);
-
-} // extern C
-#endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -1,39 +1,21 @@
 #ifndef CNCRT_KS_H_
 #define CNCRT_KS_H_

-#include <stdint.h>
+#include <cstdint>

 extern "C" {

 void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
-
-void scratch_packing_keyswitch_lwe_list_to_glwe_64(
-    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
-    bool allocate_gpu_memory);
-
-void cuda_packing_keyswitch_lwe_list_to_glwe_64(
-    void *stream, uint32_t gpu_index, void *glwe_array_out,
-    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
-    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
-    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_lwes);
-
-void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
-                                                uint32_t gpu_index,
-                                                int8_t **fp_ks_buffer);
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,48 +1,50 @@
 #ifndef CUDA_LINALG_H_
 #define CUDA_LINALG_H_

-#include <stdint.h>
+#include "programmable_bootstrap.h"
+#include <cstdint>
+#include <device.h>

 extern "C" {

 void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
-                                          void const *lwe_array_in,
+                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
 void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
-                                          void const *lwe_array_in,
+                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
-                                       void const *lwe_array_in_1,
-                                       void const *lwe_array_in_2,
+                                       void *lwe_array_in_1,
+                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
-                                       void const *lwe_array_in_1,
-                                       void const *lwe_array_in_2,
+                                       void *lwe_array_in_1,
+                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *plaintext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *plaintext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *cleartext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *cleartext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *cleartext_array_in,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *cleartext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
 }

 #endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
@@ -1,7 +0,0 @@
-#ifndef CUDA_PBS_ENUMS_H
-#define CUDA_PBS_ENUMS_H
-
-enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
-enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
-
-#endif // CUDA_PBS_ENUMS_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -1,86 +0,0 @@
-#ifndef CUDA_BOOTSTRAP_H
-#define CUDA_BOOTSTRAP_H
-
-#include "pbs_enums.h"
-#include <stdint.h>
-
-extern "C" {
-void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
-                                 void const *input1, void const *input2,
-                                 void *output, uint32_t polynomial_size,
-                                 uint32_t total_polynomials);
-
-void cuda_convert_lwe_programmable_bootstrap_key_32(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size);
-
-void cuda_convert_lwe_programmable_bootstrap_key_64(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size);
-
-void scratch_cuda_programmable_bootstrap_amortized_32(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void scratch_cuda_programmable_bootstrap_amortized_64(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
-
-void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
-
-void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
-                                                   uint32_t gpu_index,
-                                                   int8_t **pbs_buffer);
-
-void scratch_cuda_programmable_bootstrap_32(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void scratch_cuda_programmable_bootstrap_64(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
-
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
-
-void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
-                                         int8_t **pbs_buffer);
-}
-#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -1,38 +0,0 @@
-#ifndef CUDA_MULTI_BIT_H
-#define CUDA_MULTI_BIT_H
-
-#include "pbs_enums.h"
-#include "stdint.h"
-
-extern "C" {
-
-bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples);
-
-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size, uint32_t grouping_factor);
-
-void scratch_cuda_multi_bit_programmable_bootstrap_64(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
-    uint32_t lut_stride);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
-                                                   uint32_t gpu_index,
-                                                   int8_t **pbs_buffer);
-}
-
-#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -1,10 +1,95 @@
-#ifndef CUDA_BOOTSTRAP_UTILITIES_H
-#define CUDA_BOOTSTRAP_UTILITIES_H
+#ifndef CUDA_BOOTSTRAP_H
+#define CUDA_BOOTSTRAP_H

 #include "device.h"
-#include "pbs_enums.h"
-#include "vector_types.h"
-#include <stdint.h>
+#include <cstdint>
+
+enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
+enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
+
+extern "C" {
+void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
+                                 void *input1, void *input2, void *output,
+                                 uint32_t polynomial_size,
+                                 uint32_t total_polynomials);
+
+void cuda_convert_lwe_programmable_bootstrap_key_32(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);
+
+void cuda_convert_lwe_programmable_bootstrap_key_64(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);
+
+void scratch_cuda_programmable_bootstrap_amortized_32(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void scratch_cuda_programmable_bootstrap_amortized_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
+                                                   uint32_t gpu_index,
+                                                   int8_t **pbs_buffer);
+
+void scratch_cuda_programmable_bootstrap_32(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void scratch_cuda_programmable_bootstrap_64(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
+                                         int8_t **pbs_buffer);
+
+uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count);
+
+uint64_t get_buffer_size_programmable_bootstrap_64(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count);
+}

 template <typename Torus>
 uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
@@ -250,36 +335,30 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
-    uint32_t lut_stride);
+    uint32_t level_count, uint32_t num_samples);

 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
-    uint32_t lut_stride);
+    uint32_t level_count, uint32_t num_samples);

 #if (CUDA_ARCH >= 900)
 template <typename Torus>
 void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
-    uint32_t lut_stride);
+    uint32_t level_count, uint32_t num_samples);

 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
@@ -334,4 +413,4 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(

 #endif

-#endif // CUDA_BOOTSTRAP_UTILITIES_H
+#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -1,7 +1,38 @@
-#ifndef CUDA_MULTI_BIT_UTILITIES_H
-#define CUDA_MULTI_BIT_UTILITIES_H
+#ifndef CUDA_MULTI_BIT_H
+#define CUDA_MULTI_BIT_H

-#include "pbs_utilities.h"
+#include "programmable_bootstrap.h"
+#include <cstdint>
+
+extern "C" {
+
+bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t num_samples);
+
+void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size, uint32_t grouping_factor);
+
+void scratch_cuda_multi_bit_programmable_bootstrap_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
+                                                   uint32_t gpu_index,
+                                                   int8_t **pbs_buffer);
+}

 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
@@ -16,19 +47,18 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
 template <typename Torus>
 void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
 void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 #endif

 template <typename Torus>
@@ -40,30 +70,27 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
 template <typename Torus>
 void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
@@ -93,17 +120,13 @@ template <typename Torus>
 uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);

-template <typename Torus, class params>
-uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size);
-
 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
  int8_t *d_mem_acc_cg = NULL;
  int8_t *d_mem_acc_tbc = NULL;
-  uint32_t lwe_chunk_size;
+
  double2 *keybundle_fft;
  Torus *global_accumulator;
  double2 *global_accumulator_fft;
@@ -115,7 +138,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;
-    this->lwe_chunk_size = lwe_chunk_size;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

    // default
@@ -264,4 +286,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  }
 };

-#endif // CUDA_MULTI_BIT_UTILITIES_H
+template <typename Torus, class params>
+uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                            uint32_t polynomial_size);
+
+#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,3 +1,17 @@
+set(SOURCES
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -22,8 +22,8 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
 }

 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
-                                 void *lwe_array_out, void const *glwe_array_in,
-                                 uint32_t const *nth_array, uint32_t num_nths,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_glwes,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size) {

@@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
  case 256:
    host_sample_extract<uint64_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  case 512:
    host_sample_extract<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  case 1024:
    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  case 2048:
    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  case 4096:
    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  case 8192:
    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  case 16384:
    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
        glwe_dimension);
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -27,9 +27,8 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
 }

 template <typename Torus, class params>
-__global__ void sample_extract(Torus *lwe_array_out, Torus const *glwe_array_in,
-                               uint32_t const *nth_array,
-                               uint32_t glwe_dimension) {
+__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
+                               uint32_t *nth_array, uint32_t glwe_dimension) {

  const int input_id = blockIdx.x;

@@ -39,11 +38,10 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus const *glwe_array_in,
  auto lwe_out = lwe_array_out + input_id * lwe_output_size;

  // We assume each GLWE will store the first polynomial_size inputs
-  uint32_t lwe_per_glwe = params::degree;
-  auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
+  uint32_t nth_per_glwe = params::degree;
+  auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;

-  // nth is ensured to be in [0, lwe_per_glwe)
-  auto nth = nth_array[input_id] % lwe_per_glwe;
+  auto nth = nth_array[input_id];

  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
@@ -51,13 +49,12 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus const *glwe_array_in,

 template <typename Torus, class params>
 __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
-                                  Torus *lwe_array_out,
-                                  Torus const *glwe_array_in,
-                                  uint32_t const *nth_array, uint32_t num_nths,
+                                  Torus *lwe_array_out, Torus *glwe_array_in,
+                                  uint32_t *nth_array, uint32_t num_glwes,
                                  uint32_t glwe_dimension) {
  cudaSetDevice(gpu_index);

-  dim3 grid(num_nths);
+  dim3 grid(num_glwes);
  dim3 thds(params::degree / params::opt);
  sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
      lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
+  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
@@ -37,49 +37,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
 */
 void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_output_indexes),
-      static_cast<const uint64_t *>(lwe_array_in),
-      static_cast<const uint64_t *>(lwe_input_indexes),
-      static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
-      base_log, level_count, num_samples);
-}
-
-void scratch_packing_keyswitch_lwe_list_to_glwe_64(
-    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
-    bool allocate_gpu_memory) {
-  scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
-      glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
-}
-/* Perform functional packing keyswitch on a batch of 64 bits input LWE
- * ciphertexts.
- */
-void cuda_packing_keyswitch_lwe_list_to_glwe_64(
-    void *stream, uint32_t gpu_index, void *glwe_array_out,
-    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
-    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
-    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_lwes) {
-
-  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<uint64_t *>(glwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in),
-      static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
-      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
-      base_log, level_count, num_lwes);
-}
-
-void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
-                                                uint32_t gpu_index,
-                                                int8_t **fp_ks_buffer) {
-  cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
+      static_cast<uint64_t *>(lwe_output_indexes),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -7,7 +7,6 @@
 #include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
-#include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>
@@ -75,8 +74,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
                                            level_count);
      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);

-      for (int j = level_count - 1; j >= 0; j--) {
-        // Levels are stored in reverse order
+      for (int j = 0; j < level_count; j++) {
        auto ksk_block =
            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -100,12 +98,11 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
 }

 template <typename Torus>
-__host__ void host_keyswitch_lwe_ciphertext_vector(
+__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *ksk, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
+    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  cudaSetDevice(gpu_index);

@@ -126,13 +123,13 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
 }

 template <typename Torus>
-void execute_keyswitch_async(cudaStream_t const *streams,
-                             uint32_t const *gpu_indexes, uint32_t gpu_count,
+void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count,
                             const LweArrayVariant<Torus> &lwe_array_out,
                             const LweArrayVariant<Torus> &lwe_output_indexes,
                             const LweArrayVariant<Torus> &lwe_array_in,
                             const LweArrayVariant<Torus> &lwe_input_indexes,
-                             Torus *const *ksks, uint32_t lwe_dimension_in,
+                             Torus **ksks, uint32_t lwe_dimension_in,
                             uint32_t lwe_dimension_out, uint32_t base_log,
                             uint32_t level_count, uint32_t num_samples) {

@@ -149,7 +146,7 @@ void execute_keyswitch_async(cudaStream_t const *streams,
        GET_VARIANT_ELEMENT(lwe_input_indexes, i);

    // Compute Keyswitch
-    host_keyswitch_lwe_ciphertext_vector<Torus>(
+    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
        streams[i], gpu_indexes[i], current_lwe_array_out,
        current_lwe_output_indexes, current_lwe_array_in,
        current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -157,160 +154,4 @@ void execute_keyswitch_async(cudaStream_t const *streams,
  }
 }

-template <typename Torus>
-__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
-    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
-    bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
-
-  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
-
-  if (allocate_gpu_memory)
-    *fp_ks_buffer = (int8_t *)cuda_malloc_async(
-        2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
-        gpu_index);
-}
-
-// public functional packing keyswitch for a single LWE ciphertext
-//
-// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
-// different thread blocks at the x-axis to work on that input.
-template <typename Torus>
-__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
-    Torus *glwe_out, Torus const *lwe_in, Torus const *fp_ksk,
-    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
-
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  size_t glwe_size = (glwe_dimension + 1);
-
-  if (tid < glwe_size * polynomial_size) {
-    const int local_index = threadIdx.x;
-    // the output_glwe is split in polynomials and each x-block takes one of
-    // them
-    size_t poly_id = blockIdx.x;
-    size_t coef_per_block = blockDim.x;
-
-    // number of coefficients inside fp-ksk block for each lwe_input coefficient
-    size_t ksk_block_size = glwe_size * polynomial_size * level_count;
-
-    // initialize accumulator to 0
-    glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
-                        tid == glwe_dimension * polynomial_size);
-
-    // Iterate through all lwe elements
-    for (int i = 0; i < lwe_dimension_in; i++) {
-      // Round and prepare decomposition
-      Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
-
-      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
-      Torus mod_b_mask = (1ll << base_log) - 1ll;
-
-      // block of key for current lwe coefficient (cur_input_lwe[i])
-      auto ksk_block = &fp_ksk[i * ksk_block_size];
-      for (int j = level_count - 1; j >= 0; j--) {
-        // Levels are stored in reverse order
-        auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
-        // Iterate through each level and multiply by the ksk piece
-        auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
-        Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
-        glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
-      }
-    }
-  }
-}
-
-// public functional packing keyswitch for a batch of LWE ciphertexts
-//
-// Selects the input each thread is working on using the y-block index.
-//
-// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
-// different thread blocks at the x-axis to work on that input.
-template <typename Torus>
-__global__ void packing_keyswitch_lwe_list_to_glwe(
-    Torus *glwe_array_out, Torus const *lwe_array_in, Torus const *fp_ksk,
-    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    Torus *d_mem) {
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
-  const int lwe_size = (lwe_dimension_in + 1);
-
-  const int input_id = blockIdx.y;
-  const int degree = input_id;
-
-  // Select an input
-  auto lwe_in = lwe_array_in + input_id * lwe_size;
-  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
-  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
-  // KS LWE to GLWE
-  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext<Torus>(
-      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
-      polynomial_size, base_log, level_count);
-
-  // P * x ^degree
-  auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
-  auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
-  polynomial_accumulate_monic_monomial_mul<Torus>(out_result, in_poly, degree,
-                                                  tid % polynomial_size,
-                                                  polynomial_size, 1, true);
-}
-
-/// To-do: Rewrite this kernel for efficiency
-template <typename Torus>
-__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
-                                 uint32_t glwe_dimension,
-                                 uint32_t polynomial_size, uint32_t num_lwes) {
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < (glwe_dimension + 1) * polynomial_size) {
-    glwe_out[tid] = glwe_array_in[tid];
-
-    // Accumulate
-    for (int i = 1; i < num_lwes; i++) {
-      auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
-      glwe_out[tid] += glwe_in[tid];
-    }
-  }
-}
-
-template <typename Torus>
-__host__ void host_packing_keyswitch_lwe_list_to_glwe(
-    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
-    Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
-    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_lwes) {
-
-  if (num_lwes > polynomial_size)
-    PANIC("Cuda error: too many LWEs to pack. The number of LWEs should be "
-          "smaller than "
-          "polynomial_size.")
-
-  cudaSetDevice(gpu_index);
-  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
-
-  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
-
-  dim3 grid(num_blocks, num_lwes);
-  dim3 threads(num_threads);
-
-  auto d_mem = (Torus *)fp_ks_buffer;
-  auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
-
-  // individually keyswitch each lwe
-  packing_keyswitch_lwe_list_to_glwe<Torus><<<grid, threads, 0, stream>>>(
-      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
-      glwe_dimension, polynomial_size, base_log, level_count, d_mem);
-  check_cuda_error(cudaGetLastError());
-
-  // accumulate to a single glwe
-  accumulate_glwes<Torus><<<num_blocks, threads, 0, stream>>>(
-      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
-      num_lwes);
-  check_cuda_error(cudaGetLastError());
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,16 +1,9 @@
 #ifndef CNCRT_TORUS_CUH
 #define CNCRT_TORUS_CUH

-#include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <limits>

-template <typename T>
-__host__ __device__ __forceinline__ constexpr double get_two_pow_torus_bits() {
-  return (sizeof(T) == 4) ? 4294967296.0 : 18446744073709551616.0;
-}
-
 template <typename T>
 __device__ inline void typecast_double_to_torus(double x, T &r) {
  r = T(x);
@@ -33,31 +26,23 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
  r = lll;
 }

-template <typename T>
-__device__ inline void typecast_double_round_to_torus(double x, T &r) {
-  constexpr double mx = get_two_pow_torus_bits<T>();
-  // floor must be used here because round has an issue with rounding .5,
-  // as it rounds away from zero.
-  double frac = x - floor(x);
-  frac *= mx;
-  typecast_double_to_torus(round(frac), r);
-}
-
 template <typename T>
 __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t level_count) {
-  const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
-  const T shift = non_rep_bit_count - 1;
+  T shift = sizeof(T) * 8 - level_count * base_log;
+  T mask = 1ll << (shift - 1);
+  T b = (x & mask) >> (shift - 1);
  T res = x >> shift;
-  res += 1;
-  res &= (T)(-2);
-  return res << shift;
+  res += b;
+  res <<= shift;
+  return res;
 }

 template <typename T>
 __device__ __forceinline__ void modulus_switch(T input, T &output,
                                               uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(T) * 8;
+
  output = input + (((T)1) << (BITS - log_modulus - 1));
  output >>= (BITS - log_modulus);
 }
@@ -69,27 +54,4 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
  return output;
 }

-template <typename Torus>
-__global__ void modulus_switch_inplace(Torus *array, int size,
-                                       uint32_t log_modulus) {
-  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < size) {
-    array[tid] = modulus_switch(array[tid], log_modulus);
-  }
-}
-
-template <typename Torus>
-__host__ void host_modulus_switch_inplace(cudaStream_t stream,
-                                          uint32_t gpu_index, Torus *array,
-                                          int size, uint32_t log_modulus) {
-  cudaSetDevice(gpu_index);
-
-  int num_threads = 0, num_blocks = 0;
-  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
-
-  modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
-                                                                 log_modulus);
-  check_cuda_error(cudaGetLastError());
-}
-
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -113,7 +113,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
 }

 /// Copy memory within a GPU asynchronously
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
@@ -137,30 +137,6 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
  }
 }

-/// Copy memory within a GPU
-void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                            uint32_t gpu_index) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
-  check_cuda_error(cudaSetDevice(gpu_index));
-  if (attr_src.device == attr_dest.device) {
-    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
-  } else {
-    check_cuda_error(
-        cudaMemcpyPeer(dest, attr_dest.device, src, attr_src.device, size));
-  }
-}
-
 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
@@ -201,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    int num_blocks = (n + block_size - 1) / block_size;

    // Launch the kernel
-    cuda_set_value_kernel<Torus>
-        <<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
+    cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
+                                                                 n);
    check_cuda_error(cudaGetLastError());
  }
 }
@@ -274,7 +250,7 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
 #if CUDA_ARCH == 900
  max_shared_memory = 226000;
 #elif CUDA_ARCH == 890
-  max_shared_memory = 100000;
+  max_shared_memory = 127000;
 #elif CUDA_ARCH == 800
  max_shared_memory = 163000;
 #elif CUDA_ARCH == 700
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -6,7 +6,6 @@
 #include "twiddles.cuh"
 #include "types/complex/operations.cuh"

-using Index = unsigned;
 /*
 * Direct negacyclic FFT:
 *   - before the FFT the N real coefficients are stored into a
@@ -32,81 +31,290 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
   *  full loop, which should increase performance
   */

-  __syncthreads();
-  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
-  constexpr Index LOG2_DEGREE = params::log2_degree;
-  constexpr Index HALF_DEGREE = params::degree >> 1;
-  constexpr Index STRIDE = params::degree / params::opt;
-
-  Index tid = threadIdx.x;
-  double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
-
-  // load into registers
-#pragma unroll
-  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
-    u[i] = A[tid];
-    v[i] = A[tid + HALF_DEGREE];
-
-    tid += STRIDE;
-  }
-
+  size_t tid = threadIdx.x;
+  size_t twid_id;
+  size_t i1, i2;
+  double2 u, v, w;
  // level 1
  // we don't make actual complex multiplication on level1 since we have only
  // one twiddle, it's real and image parts are equal, so we can multiply
  // it with simpler operations
 #pragma unroll
-  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
-    w = v[i] * (double2){0.707106781186547461715008466854,
-                         0.707106781186547461715008466854};
-    v[i] = u[i] - w;
-    u[i] = u[i] + w;
-  }
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    i1 = tid;
+    i2 = tid + params::degree / 2;

-  Index twiddle_shift = 1;
-  for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
-    Index lane_mask = 1 << (l - 1);
-    Index thread_mask = (1 << l) - 1;
-    twiddle_shift <<= 1;
+    u = A[i1];
+    v = A[i2] * (double2){0.707106781186547461715008466854,
+                          0.707106781186547461715008466854};

-    tid = threadIdx.x;
-    __syncthreads();
-#pragma unroll
-    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
-      Index rank = tid & thread_mask;
-      bool u_stays_in_register = rank < lane_mask;
-      A[tid] = (u_stays_in_register) ? v[i] : u[i];
-      tid = tid + STRIDE;
-    }
-    __syncthreads();
+    A[i1] += v;
+    A[i2] = u - v;

-    tid = threadIdx.x;
-#pragma unroll
-    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
-      Index rank = tid & thread_mask;
-      bool u_stays_in_register = rank < lane_mask;
-      w = A[tid ^ lane_mask];
-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
-      w = negtwiddles[tid / lane_mask + twiddle_shift];
-
-      w *= v[i];
-
-      v[i] = u[i] - w;
-      u[i] = u[i] + w;
-      tid = tid + STRIDE;
-    }
+    tid += params::degree / params::opt;
  }
  __syncthreads();

-  // store registers in SM
+  // level 2
+  // from this level on there is more than one twiddle and none of them has
+  // equal real and imaginary parts, so full complex multiplication is needed;
+  // at each level, params::degree / 2^level is the number of coefficients
+  // inside each chunk the input is divided into at that level
+  //
  tid = threadIdx.x;
 #pragma unroll
-  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
-    A[tid * 2] = u[i];
-    A[tid * 2 + 1] = v[i];
-    tid = tid + STRIDE;
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 4);
+    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
+    i2 = i1 + params::degree / 4;
+
+    w = negtwiddles[twid_id + 2];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
  }
  __syncthreads();
+
+  // level 3
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 8);
+    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
+    i2 = i1 + params::degree / 8;
+
+    w = negtwiddles[twid_id + 4];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 4
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 16);
+    i1 =
+        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
+    i2 = i1 + params::degree / 16;
+
+    w = negtwiddles[twid_id + 8];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 5
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 32);
+    i1 =
+        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
+    i2 = i1 + params::degree / 32;
+
+    w = negtwiddles[twid_id + 16];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 6
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 64);
+    i1 =
+        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
+    i2 = i1 + params::degree / 64;
+
+    w = negtwiddles[twid_id + 32];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 7
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 128);
+    i1 = 2 * (params::degree / 128) * twid_id +
+         (tid & (params::degree / 128 - 1));
+    i2 = i1 + params::degree / 128;
+
+    w = negtwiddles[twid_id + 64];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // From level 8 onward we must check params::degree. The minimum supported
+  // polynomial size is 256; the compressed size is half of that, so the
+  // minimum supported compressed size is 128 and the first 7 butterfly levels
+  // are always needed. Because the butterfly levels are hardcoded, each
+  // further level is guarded by a compile-time check that the polynomial is
+  // large enough to require it.
+  if constexpr (params::degree >= 256) {
+    // level 8
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 256);
+      i1 = 2 * (params::degree / 256) * twid_id +
+           (tid & (params::degree / 256 - 1));
+      i2 = i1 + params::degree / 256;
+
+      w = negtwiddles[twid_id + 128];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 512) {
+    // level 9
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 512);
+      i1 = 2 * (params::degree / 512) * twid_id +
+           (tid & (params::degree / 512 - 1));
+      i2 = i1 + params::degree / 512;
+
+      w = negtwiddles[twid_id + 256];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 1024) {
+    // level 10
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 1024);
+      i1 = 2 * (params::degree / 1024) * twid_id +
+           (tid & (params::degree / 1024 - 1));
+      i2 = i1 + params::degree / 1024;
+
+      w = negtwiddles[twid_id + 512];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 2048) {
+    // level 11
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 2048);
+      i1 = 2 * (params::degree / 2048) * twid_id +
+           (tid & (params::degree / 2048 - 1));
+      i2 = i1 + params::degree / 2048;
+
+      w = negtwiddles[twid_id + 1024];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 4096) {
+    // level 12
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 4096);
+      i1 = 2 * (params::degree / 4096) * twid_id +
+           (tid & (params::degree / 4096 - 1));
+      i2 = i1 + params::degree / 4096;
+
+      w = negtwiddles[twid_id + 2048];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 8192) {
+    // level 13
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 8192);
+      i1 = 2 * (params::degree / 8192) * twid_id +
+           (tid & (params::degree / 8192 - 1));
+      i2 = i1 + params::degree / 8192;
+
+      w = negtwiddles[twid_id + 4096];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
 }

 /*
@@ -121,82 +329,284 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
   *  full loop, which should increase performance
   */

-  __syncthreads();
-  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
-  constexpr Index LOG2_DEGREE = params::log2_degree;
-  constexpr Index DEGREE = params::degree;
-  constexpr Index HALF_DEGREE = params::degree >> 1;
-  constexpr Index STRIDE = params::degree / params::opt;
-
  size_t tid = threadIdx.x;
-  double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
+  size_t twid_id;
+  size_t i1, i2;
+  double2 u, w;

-  // load into registers and divide by compressed polynomial size
-#pragma unroll
-  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
-    u[i] = A[2 * tid];
-    v[i] = A[2 * tid + 1];
-
-    u[i] /= DEGREE;
-    v[i] /= DEGREE;
-
-    tid += STRIDE;
-  }
-
-  Index twiddle_shift = DEGREE;
-  for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
-    Index lane_mask = 1 << (l - 1);
-    Index thread_mask = (1 << l) - 1;
-    tid = threadIdx.x;
-    twiddle_shift >>= 1;
-
-    // at this point registers are ready for the  butterfly
-    tid = threadIdx.x;
-    __syncthreads();
-#pragma unroll
-    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
-      w = (u[i] - v[i]);
-      u[i] += v[i];
-      v[i] = w * conjugate(negtwiddles[tid / lane_mask + twiddle_shift]);
-
-      // keep one of the register for next iteration and store another one in sm
-      Index rank = tid & thread_mask;
-      bool u_stays_in_register = rank < lane_mask;
-      A[tid] = (u_stays_in_register) ? v[i] : u[i];
-
-      tid = tid + STRIDE;
-    }
-    __syncthreads();
-
-    // prepare registers for next butterfly iteration
-    tid = threadIdx.x;
-#pragma unroll
-    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
-      Index rank = tid & thread_mask;
-      bool u_stays_in_register = rank < lane_mask;
-      w = A[tid ^ lane_mask];
-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
-
-      tid = tid + STRIDE;
-    }
-  }
-
-  // last iteration
-  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
-    w = (u[i] - v[i]);
-    u[i] = u[i] + v[i];
-    v[i] = w * (double2){0.707106781186547461715008466854,
-                         -0.707106781186547461715008466854};
+  // divide input by compressed polynomial size
+  tid = threadIdx.x;
+  for (size_t i = 0; i < params::opt; ++i) {
+    A[tid] /= params::degree;
+    tid += params::degree / params::opt;
  }
  __syncthreads();
-  // store registers in SM
+
+  // none of the twiddles have equal real and imag part, so
+  // complete complex multiplication has to be done
+  // here we have more than one twiddle
+  // mapping in backward fft is reversed
+  // butterfly operation is started from last level
+
+  if constexpr (params::degree >= 8192) {
+    // level 13
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 8192);
+      i1 = 2 * (params::degree / 8192) * twid_id +
+           (tid & (params::degree / 8192 - 1));
+      i2 = i1 + params::degree / 8192;
+
+      w = negtwiddles[twid_id + 4096];
+      u = A[i1] - A[i2];
+
+      A[i1] += A[i2];
+      A[i2] = u * conjugate(w);
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 4096) {
+    // level 12
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 4096);
+      i1 = 2 * (params::degree / 4096) * twid_id +
+           (tid & (params::degree / 4096 - 1));
+      i2 = i1 + params::degree / 4096;
+
+      w = negtwiddles[twid_id + 2048];
+      u = A[i1] - A[i2];
+
+      A[i1] += A[i2];
+      A[i2] = u * conjugate(w);
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 2048) {
+    // level 11
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 2048);
+      i1 = 2 * (params::degree / 2048) * twid_id +
+           (tid & (params::degree / 2048 - 1));
+      i2 = i1 + params::degree / 2048;
+
+      w = negtwiddles[twid_id + 1024];
+      u = A[i1] - A[i2];
+
+      A[i1] += A[i2];
+      A[i2] = u * conjugate(w);
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 1024) {
+    // level 10
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 1024);
+      i1 = 2 * (params::degree / 1024) * twid_id +
+           (tid & (params::degree / 1024 - 1));
+      i2 = i1 + params::degree / 1024;
+
+      w = negtwiddles[twid_id + 512];
+      u = A[i1] - A[i2];
+
+      A[i1] += A[i2];
+      A[i2] = u * conjugate(w);
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 512) {
+    // level 9
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 512);
+      i1 = 2 * (params::degree / 512) * twid_id +
+           (tid & (params::degree / 512 - 1));
+      i2 = i1 + params::degree / 512;
+
+      w = negtwiddles[twid_id + 256];
+      u = A[i1] - A[i2];
+
+      A[i1] += A[i2];
+      A[i2] = u * conjugate(w);
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 256) {
+    // level 8
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 256);
+      i1 = 2 * (params::degree / 256) * twid_id +
+           (tid & (params::degree / 256 - 1));
+      i2 = i1 + params::degree / 256;
+
+      w = negtwiddles[twid_id + 128];
+      u = A[i1] - A[i2];
+
+      A[i1] += A[i2];
+      A[i2] = u * conjugate(w);
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  // Below level 8 there is no need to check params::degree. The minimum
+  // supported polynomial size is 256; the compressed size is half of that,
+  // so the minimum supported compressed size is 128 and the last 7 butterfly
+  // levels are always needed. These hardcoded levels therefore run
+  // unconditionally, without checking whether the polynomial is large enough
+  // to require them.
+  // level 7
  tid = threadIdx.x;
 #pragma unroll
-  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
-    A[tid] = u[i];
-    A[tid + HALF_DEGREE] = v[i];
-    tid = tid + STRIDE;
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 128);
+    i1 = 2 * (params::degree / 128) * twid_id +
+         (tid & (params::degree / 128 - 1));
+    i2 = i1 + params::degree / 128;
+
+    w = negtwiddles[twid_id + 64];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 6
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 64);
+    i1 =
+        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
+    i2 = i1 + params::degree / 64;
+
+    w = negtwiddles[twid_id + 32];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 5
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 32);
+    i1 =
+        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
+    i2 = i1 + params::degree / 32;
+
+    w = negtwiddles[twid_id + 16];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 4
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 16);
+    i1 =
+        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
+    i2 = i1 + params::degree / 16;
+
+    w = negtwiddles[twid_id + 8];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 3
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 8);
+    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
+    i2 = i1 + params::degree / 8;
+
+    w = negtwiddles[twid_id + 4];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 2
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 4);
+    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
+    i2 = i1 + params::degree / 4;
+
+    w = negtwiddles[twid_id + 2];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 1
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 2);
+    i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
+    i2 = i1 + params::degree / 2;
+
+    w = negtwiddles[twid_id + 1];
+    u = A[i1] - A[i2];
+
+    A[i1] += A[i2];
+    A[i2] = u * conjugate(w);
+
+    tid += params::degree / params::opt;
  }
  __syncthreads();
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
@@ -1,8 +1,8 @@
 #include "integer/addition.cuh"

 void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
@@ -23,10 +23,9 @@ void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
 }

 void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
+    void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_blocks) {

  auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
  SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
@@ -34,13 +33,13 @@ void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(

  host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lhs), static_cast<uint64_t const *>(rhs),
-      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t *const *)(ksks),
-      mem, num_blocks);
+      static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
+      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
+      num_blocks);
 }

-void cleanup_signed_overflowing_add_or_sub(void *const *streams,
-                                           uint32_t const *gpu_indexes,
+void cleanup_signed_overflowing_add_or_sub(void **streams,
+                                           uint32_t *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {
  int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -3,13 +3,13 @@

 #include "crypto/keyswitch.cuh"
 #include "device.h"
+#include "integer.h"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
-#include "integer/integer_utilities.h"
 #include "integer/negation.cuh"
 #include "integer/scalar_shifts.cuh"
 #include "linear_algebra.h"
-#include "pbs/programmable_bootstrap.h"
+#include "programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
@@ -20,11 +20,10 @@

 template <typename Torus>
 void host_resolve_signed_overflow(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation,
-    Torus const *last_block_input_carry, Torus *last_block_output_carry,
-    int_resolve_signed_overflow_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *result, Torus *last_block_inner_propagation,
+    Torus *last_block_input_carry, Torus *last_block_output_carry,
+    int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {

  auto x = mem->x;

@@ -38,12 +37,12 @@ void host_resolve_signed_overflow(
      streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
      mem->params.big_lwe_dimension, 1);

-  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                       last_block_inner_propagation, x,
-                       mem->params.big_lwe_dimension, 1);
-  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                       last_block_inner_propagation, last_block_input_carry,
-                       mem->params.big_lwe_dimension, 1);
+  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                last_block_inner_propagation, x, mem->params.big_lwe_dimension,
+                1);
+  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                last_block_inner_propagation, last_block_input_carry,
+                mem->params.big_lwe_dimension, 1);

  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
                                      last_block_inner_propagation,
@@ -54,8 +53,7 @@ void host_resolve_signed_overflow(

 template <typename Torus>
 __host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
    uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
    bool allocate_gpu_memory) {
@@ -71,9 +69,9 @@ __host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
 */
 template <typename Torus>
 __host__ void host_integer_signed_overflowing_add_or_sub_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed,
-    SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
+    uint64_t **ksks,
    int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks) {

@@ -96,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(

  // phase 1
  if (op == SIGNED_OPERATION::ADDITION) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
-                         big_lwe_dimension, num_blocks);
+    host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
+                  big_lwe_dimension, num_blocks);
  } else {
-    host_integer_radix_negation<Torus>(
+    host_integer_radix_negation(
        streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
-    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
-                         big_lwe_dimension, num_blocks);
+    host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
+                  big_lwe_dimension, num_blocks);
  }

  // phase 2
@@ -111,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

-  host_propagate_single_carry<Torus>(
-      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
-      input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
-  host_generate_last_block_inner_propagation<Torus>(
+  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
+                              result, output_carry, input_carries,
+                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
      last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
      &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -128,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
  // phase 3
  auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];

-  host_resolve_signed_overflow<Torus>(
+  host_resolve_signed_overflow(
      streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
      input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);

--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -1,8 +1,8 @@
 #include "integer/bitwise_ops.cuh"

 void scratch_cuda_integer_radix_bitop_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
@@ -21,23 +21,21 @@ void scratch_cuda_integer_radix_bitop_kb_64(
 }

 void cuda_bitop_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_1),
-      static_cast<const uint64_t *>(lwe_array_2),
+      static_cast<uint64_t *>(lwe_array_1),
+      static_cast<uint64_t *>(lwe_array_2),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      lwe_ciphertext_count);
 }

-void cleanup_cuda_integer_bitop(void *const *streams,
-                                uint32_t const *gpu_indexes, uint32_t gpu_count,
-                                int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
+                                uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -4,7 +4,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.cuh"
-#include "integer/integer_utilities.h"
+#include "integer.h"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
 #include "polynomial/functions.cuh"
@@ -12,11 +12,12 @@
 #include <omp.h>

 template <typename Torus>
-__host__ void host_integer_radix_bitop_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
-    Torus const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
+__host__ void
+host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                            uint32_t gpu_count, Torus *lwe_array_out,
+                            Torus *lwe_array_1, Torus *lwe_array_2,
+                            int_bitop_buffer<Torus> *mem_ptr, void **bsks,
+                            Torus **ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

@@ -27,10 +28,9 @@ __host__ void host_integer_radix_bitop_kb(

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_bitop_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_bitop_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_bitop_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) {

  *mem_ptr =
      new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op, params,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,8 +1,8 @@
 #include "integer/cmux.cuh"

 void scratch_cuda_integer_radix_cmux_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
@@ -17,31 +17,30 @@ void scratch_cuda_integer_radix_cmux_kb_64(
  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };

-  scratch_cuda_integer_radix_cmux_kb<uint64_t>(
+  scratch_cuda_integer_radix_cmux_kb(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
 }

 void cuda_cmux_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
-    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
+    void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_condition),
-      static_cast<const uint64_t *>(lwe_array_true),
-      static_cast<const uint64_t *>(lwe_array_false),
+      static_cast<uint64_t *>(lwe_condition),
+      static_cast<uint64_t *>(lwe_array_true),
+      static_cast<uint64_t *>(lwe_array_false),
      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

      lwe_ciphertext_count);
 }

-void cleanup_cuda_integer_radix_cmux(void *const *streams,
-                                     uint32_t const *gpu_indexes,
+void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -4,13 +4,12 @@
 #include "integer.cuh"

 template <typename Torus>
-__host__ void zero_out_if(cudaStream_t const *streams,
-                          uint32_t const *gpu_indexes, uint32_t gpu_count,
-                          Torus *lwe_array_out, Torus const *lwe_array_input,
-                          Torus const *lwe_condition,
+__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
+                          uint32_t gpu_count, Torus *lwe_array_out,
+                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
-                          int_radix_lut<Torus> *predicate, void *const *bsks,
-                          Torus *const *ksks, uint32_t num_radix_blocks) {
+                          int_radix_lut<Torus> *predicate, void **bsks,
+                          Torus **ksks, uint32_t num_radix_blocks) {
  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

@@ -28,11 +27,10 @@ __host__ void zero_out_if(cudaStream_t const *streams,
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

-    device_pack_bivariate_blocks<Torus>
-        <<<num_blocks, num_threads, 0, streams[0]>>>(
-            lwe_array_out_block, predicate->lwe_indexes_in,
-            lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
-            params.big_lwe_dimension, params.message_modulus, 1);
+    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
+        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
+        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
+        params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }

@@ -43,11 +41,10 @@ __host__ void zero_out_if(cudaStream_t const *streams,

 template <typename Torus>
 __host__ void host_integer_radix_cmux_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
-    Torus const *lwe_array_true, Torus const *lwe_array_false,
-    int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
-    uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
+    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;

@@ -60,15 +57,13 @@ __host__ void host_integer_radix_cmux_kb(
  }

  auto mem_true = mem_ptr->zero_if_true_buffer;
-  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-                     lwe_array_true, lwe_condition, mem_true,
-                     mem_ptr->inverted_predicate_lut, bsks, ksks,
-                     num_radix_blocks);
+  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+              lwe_array_true, lwe_condition, mem_true,
+              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
  auto mem_false = mem_ptr->zero_if_false_buffer;
-  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
-                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
-                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
-                     num_radix_blocks);
+  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
+              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
+              bsks, ksks, num_radix_blocks);
  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
  }
@@ -80,9 +75,9 @@ __host__ void host_integer_radix_cmux_kb(
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
-                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
-                       params.big_lwe_dimension, num_radix_blocks);
+  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
+                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
+                num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
@@ -91,8 +86,8 @@ __host__ void host_integer_radix_cmux_kb(

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_cmux_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_cmux_buffer<Torus> **mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,8 +1,8 @@
 #include "integer/comparison.cuh"

 void scratch_cuda_integer_radix_comparison_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
@@ -37,10 +37,9 @@ void scratch_cuda_integer_radix_comparison_kb_64(
 }

 void cuda_comparison_integer_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_radix_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -50,9 +49,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_equality_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_1),
-        static_cast<const uint64_t *>(lwe_array_2), buffer, bsks,
-        (uint64_t **)(ksks), num_radix_blocks);
+        static_cast<uint64_t *>(lwe_array_1),
+        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
+        num_radix_blocks);
    break;
  case GT:
  case GE:
@@ -61,8 +60,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_1),
-        static_cast<const uint64_t *>(lwe_array_2), buffer,
+        static_cast<uint64_t *>(lwe_array_1),
+        static_cast<uint64_t *>(lwe_array_2), buffer,
        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        num_radix_blocks);
    break;
@@ -71,17 +70,16 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_1),
-        static_cast<const uint64_t *>(lwe_array_2), buffer, bsks,
-        (uint64_t **)(ksks), num_radix_blocks);
+        static_cast<uint64_t *>(lwe_array_1),
+        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
+        num_radix_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
  }
 }

-void cleanup_cuda_integer_comparison(void *const *streams,
-                                     uint32_t const *gpu_indexes,
+void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {

--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -4,8 +4,8 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.cuh"
+#include "integer.h"
 #include "integer/cmux.cuh"
-#include "integer/integer_utilities.h"
 #include "integer/negation.cuh"
 #include "integer/scalar_addition.cuh"
 #include "pbs/programmable_bootstrap_classic.cuh"
@@ -16,9 +16,9 @@
 // lwe_dimension + 1 threads
 // todo: This kernel MUST be refactored to a binary reduction
 template <typename Torus>
-__global__ void
-device_accumulate_all_blocks(Torus *output, Torus const *input_block,
-                             uint32_t lwe_dimension, uint32_t num_blocks) {
+__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
+                                             uint32_t lwe_dimension,
+                                             uint32_t num_blocks) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < lwe_dimension + 1) {
    auto block = &input_block[idx];
@@ -34,7 +34,7 @@ device_accumulate_all_blocks(Torus *output, Torus const *input_block,

 template <typename Torus>
 __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
-                                    Torus *output, Torus const *input,
+                                    Torus *output, Torus *input,
                                    uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
-  device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
+  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }
@@ -57,11 +57,12 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 */
 template <typename Torus>
 __host__ void are_all_comparisons_block_true(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -74,7 +75,7 @@ __host__ void are_all_comparisons_block_true(
  auto tmp_out = are_all_block_true_buffer->tmp_out;

  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
+  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
@@ -95,9 +96,8 @@ __host__ void are_all_comparisons_block_true(
    auto is_equal_to_num_blocks_map =
        &are_all_block_true_buffer->is_equal_to_lut_map;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
-                                   input_blocks, big_lwe_dimension,
-                                   chunk_length);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
+                            input_blocks, big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -121,8 +121,9 @@ __host__ void are_all_comparisons_block_true(
            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                     max_value, num_radix_blocks, true);

-        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
-          return x == chunk_length;
+        auto is_equal_to_num_blocks_lut_f = [max_value,
+                                             chunk_length](Torus x) -> Torus {
+          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
@@ -159,11 +160,12 @@ __host__ void are_all_comparisons_block_true(
 */
 template <typename Torus>
 __host__ void is_at_least_one_comparisons_block_true(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -172,7 +174,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
+  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
@@ -190,9 +192,8 @@ __host__ void is_at_least_one_comparisons_block_true(
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
-                                   input_blocks, big_lwe_dimension,
-                                   chunk_length);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
+                            input_blocks, big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -239,11 +240,10 @@ __host__ void is_at_least_one_comparisons_block_true(
 // are_all_comparisons_block_true
 template <typename Torus>
 __host__ void host_compare_with_zero_equality(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, int32_t num_radix_blocks,
-    int_radix_lut<Torus> *zero_comparison) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -280,8 +280,8 @@ __host__ void host_compare_with_zero_equality(
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

-      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
-                                   big_lwe_dimension, chunk_size);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
+                            big_lwe_dimension, chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);
@@ -295,23 +295,22 @@ __host__ void host_compare_with_zero_equality(
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
      zero_comparison);
-  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
-                                        lwe_array_out, sum, mem_ptr, bsks, ksks,
-                                        num_sum_blocks);
+  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                 sum, mem_ptr, bsks, ksks, num_sum_blocks);
 }

 template <typename Torus>
 __host__ void host_integer_radix_equality_check_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
-    Torus const *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

  auto eq_buffer = mem_ptr->eq_buffer;

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+  integer_radix_apply_bivariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);
@@ -320,17 +319,18 @@ __host__ void host_integer_radix_equality_check_kb(
  //
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
-                                        lwe_array_out, comparisons, mem_ptr,
-                                        bsks, ksks, num_radix_blocks);
+  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                 comparisons, mem_ptr, bsks, ksks,
+                                 num_radix_blocks);
 }

 template <typename Torus>
-__host__ void compare_radix_blocks_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
-    Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
+__host__ void
+compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *lwe_array_out,
+                        Torus *lwe_array_left, Torus *lwe_array_right,
+                        int_comparison_buffer<Torus> *mem_ptr, void **bsks,
+                        Torus **ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -352,20 +352,19 @@ __host__ void compare_radix_blocks_kb(

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
-                          lwe_array_left, lwe_array_right, big_lwe_dimension,
-                          num_radix_blocks);
+  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
+                   lwe_array_right, big_lwe_dimension, num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace<Torus>(
+  host_integer_radix_add_scalar_one_inplace(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
 }
@@ -374,12 +373,13 @@ __host__ void compare_radix_blocks_kb(
 // (inferior, equal, superior) to one single shortint block containing the
 // final sign
 template <typename Torus>
-__host__ void tree_sign_reduction(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_block_comparisons,
-    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
-    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+__host__ void
+tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
+                    uint32_t gpu_count, Torus *lwe_array_out,
+                    Torus *lwe_block_comparisons,
+                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
+                    std::function<Torus(Torus)> sign_handler_f, void **bsks,
+                    Torus **ksks, uint32_t num_radix_blocks) {

  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -406,8 +406,8 @@ __host__ void tree_sign_reduction(

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
-    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                       partial_block_count, 4);
+    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -433,8 +433,8 @@ __host__ void tree_sign_reduction(
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
-    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                       partial_block_count, 4);
+    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
@@ -454,18 +454,18 @@ __host__ void tree_sign_reduction(
  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
-      last_lut);
+  integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                 gpu_count, lwe_array_out, y,
+                                                 bsks, ksks, 1, last_lut);
 }

 template <typename Torus>
 __host__ void host_integer_radix_difference_check_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
-    Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
+    int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> reduction_lut_f, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

  auto diff_buffer = mem_ptr->diff_buffer;

@@ -476,8 +476,8 @@ __host__ void host_integer_radix_difference_check_kb(
  auto carry_modulus = params.carry_modulus;

  uint32_t packed_num_radix_blocks = num_radix_blocks;
-  Torus *lhs = (Torus *)lwe_array_left;
-  Torus *rhs = (Torus *)lwe_array_right;
+  auto lhs = lwe_array_left;
+  auto rhs = lwe_array_right;
  if (carry_modulus >= message_modulus) {
    // Packing is possible
    // Pack inputs
@@ -488,21 +488,19 @@ __host__ void host_integer_radix_difference_check_kb(
    if (mem_ptr->is_signed) {
      packed_num_radix_blocks -= 2;
    }
-    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
-                       big_lwe_dimension, packed_num_radix_blocks,
-                       message_modulus);
-    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
-                       lwe_array_right, big_lwe_dimension,
-                       packed_num_radix_blocks, message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
+                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
+                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
    packed_num_radix_blocks /= 2;

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        packed_num_radix_blocks, identity_lut);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb(
        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
        packed_num_radix_blocks, identity_lut);

@@ -519,17 +517,16 @@ __host__ void host_integer_radix_difference_check_kb(
  if (!mem_ptr->is_signed) {
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
-    compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
-                                   lhs, rhs, mem_ptr, bsks, ksks,
-                                   packed_num_radix_blocks);
+    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
+                            rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
-      compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
-                                     comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                     packed_num_radix_blocks);
+      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
+                              rhs, mem_ptr, bsks, ksks,
+                              packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -538,21 +535,21 @@ __host__ void host_integer_radix_difference_check_kb(
      Torus *last_right_block_before_sign_block =
          diff_buffer->tmp_packed_right +
          packed_num_radix_blocks * big_lwe_size;
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
          1, identity_lut);
-      compare_radix_blocks_kb<Torus>(
+      compare_radix_blocks_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + packed_num_radix_blocks * big_lwe_size,
          last_left_block_before_sign_block, last_right_block_before_sign_block,
          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -561,11 +558,11 @@ __host__ void host_integer_radix_difference_check_kb(
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
-      compare_radix_blocks_kb<Torus>(
-          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
-          lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
+      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
+                              lwe_array_left, lwe_array_right, mem_ptr, bsks,
+                              ksks, num_radix_blocks - 1);
      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -578,17 +575,17 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
-  tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             comparisons, mem_ptr->diff_buffer->tree_buffer,
-                             reduction_lut_f, bsks, ksks, num_comparisons);
+  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
+                      comparisons, mem_ptr->diff_buffer->tree_buffer,
+                      reduction_lut_f, bsks, ksks, num_comparisons);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_comparison_check_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_comparison_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
-    bool is_signed, bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, COMPARISON_TYPE op, bool is_signed,
+    bool allocate_gpu_memory) {

  *mem_ptr = new int_comparison_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                              op, params, num_radix_blocks,
@@ -596,23 +593,24 @@ __host__ void scratch_cuda_integer_radix_comparison_check_kb(
 }

 template <typename Torus>
-__host__ void host_integer_radix_maxmin_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left,
-    Torus const *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks) {
+__host__ void
+host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count, Torus *lwe_array_out,
+                             Torus *lwe_array_left, Torus *lwe_array_right,
+                             int_comparison_buffer<Torus> *mem_ptr, void **bsks,
+                             Torus **ksks, uint32_t total_num_radix_blocks) {

  // Compute the sign
-  host_integer_radix_difference_check_kb<Torus>(
+  host_integer_radix_difference_check_kb(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
      ksks, total_num_radix_blocks);

  // Selector
-  host_integer_radix_cmux_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out,
-      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
+  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
+                             total_num_radix_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -1,90 +0,0 @@
-#include "compression.cuh"
-
-void scratch_cuda_integer_compress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
-    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory) {
-
-  int_radix_params compression_params(
-      pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      (compression_glwe_dimension + 1) * compression_polynomial_size,
-      lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
-      carry_modulus);
-
-  scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
-      compression_params, lwe_per_glwe, storage_log_modulus,
-      allocate_gpu_memory);
-}
-void scratch_cuda_integer_decompress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
-    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
-    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t storage_log_modulus, uint32_t body_count,
-    bool allocate_gpu_memory) {
-
-  // Decompression doesn't keyswitch, so big and small dimensions are the same
-  int_radix_params encryption_params(
-      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
-      message_modulus, carry_modulus);
-
-  int_radix_params compression_params(
-      pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
-      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
-
-  scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
-      encryption_params, compression_params, storage_log_modulus,
-      allocate_gpu_memory);
-}
-void cuda_integer_compress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
-    uint32_t num_nths, int8_t *mem_ptr) {
-
-  host_integer_compress<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(glwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in), (uint64_t *const *)(fp_ksk),
-      num_nths, (int_compression<uint64_t> *)mem_ptr);
-}
-void cuda_integer_decompress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
-    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) {
-
-  host_integer_decompress<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(glwe_in), indexes_array, indexes_array_size,
-      bsks, (int_decompression<uint64_t> *)mem_ptr);
-}
-
-void cleanup_cuda_integer_compress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_compression<uint64_t> *mem_ptr =
-      (int_compression<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
-
-void cleanup_cuda_integer_decompress_radix_ciphertext_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_decompression<uint64_t> *mem_ptr =
-      (int_decompression<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -1,382 +0,0 @@
-#ifndef CUDA_INTEGER_COMPRESSION_CUH
-#define CUDA_INTEGER_COMPRESSION_CUH
-
-#include "ciphertext.h"
-#include "crypto/keyswitch.cuh"
-#include "device.h"
-#include "integer/compression/compression.h"
-#include "integer/compression/compression_utilities.h"
-#include "integer/integer.cuh"
-#include "linearalgebra/multiplication.cuh"
-#include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
-
-template <typename Torus>
-__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
-                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
-  auto nbits = sizeof(Torus) * 8;
-  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  auto glwe_index = tid / out_len;
-  auto i = tid % out_len;
-  auto chunk_array_in = array_in + glwe_index * in_len;
-  auto chunk_array_out = array_out + glwe_index * out_len;
-
-  if (tid < num_coeffs) {
-
-    auto k = nbits * i / log_modulus;
-    auto j = k;
-
-    auto start_shift = i * nbits - j * log_modulus;
-
-    auto value = chunk_array_in[j] >> start_shift;
-    j++;
-
-    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
-      auto shift = j * log_modulus - i * nbits;
-      value |= chunk_array_in[j] << shift;
-      j++;
-    }
-
-    chunk_array_out[i] = value;
-  }
-}
-
-template <typename Torus>
-__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
-                        Torus *array_out, Torus *array_in, uint32_t num_glwes,
-                        uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
-  if (array_in == array_out)
-    PANIC("Cuda error: Input and output must be different");
-
-  cudaSetDevice(gpu_index);
-  auto compression_params = mem_ptr->compression_params;
-
-  auto log_modulus = mem_ptr->storage_log_modulus;
-  // [0..num_glwes-1) GLWEs
-  auto in_len = (compression_params.glwe_dimension + 1) *
-                compression_params.polynomial_size;
-  auto number_bits_to_pack = in_len * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
-  // number_bits_to_pack.div_ceil(Scalar::BITS)
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
-
-  // Last GLWE
-  number_bits_to_pack = in_len * log_modulus;
-  auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
-
-  auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
-
-  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
-
-  dim3 grid(num_blocks);
-  dim3 threads(num_threads);
-  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
-                                            num_coeffs, in_len, out_len);
-  check_cuda_error(cudaGetLastError());
-}
-
-template <typename Torus>
-__host__ void
-host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                      uint32_t gpu_count, Torus *glwe_array_out,
-                      Torus const *lwe_array_in, Torus *const *fp_ksk,
-                      uint32_t num_radix_blocks,
-                      int_compression<Torus> *mem_ptr) {
-
-  auto compression_params = mem_ptr->compression_params;
-  auto input_lwe_dimension = compression_params.small_lwe_dimension;
-
-  // Shift
-  auto lwe_shifted = mem_ptr->tmp_lwe;
-  host_cleartext_multiplication<Torus>(
-      streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
-      (uint64_t)compression_params.message_modulus, input_lwe_dimension,
-      num_radix_blocks);
-
-  uint32_t lwe_in_size = input_lwe_dimension + 1;
-  uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
-                           compression_params.polynomial_size;
-  uint32_t num_glwes_for_compression =
-      num_radix_blocks / mem_ptr->lwe_per_glwe + 1;
-
-  // Keyswitch LWEs to GLWE
-  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
-  cuda_memset_async(tmp_glwe_array_out, 0,
-                    num_glwes_for_compression *
-                        (compression_params.glwe_dimension + 1) *
-                        compression_params.polynomial_size * sizeof(Torus),
-                    streams[0], gpu_indexes[0]);
-  auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
-  auto rem_lwes = num_radix_blocks;
-
-  auto lwe_subset = lwe_shifted;
-  auto glwe_out = tmp_glwe_array_out;
-  while (rem_lwes > 0) {
-    auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);
-
-    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
-        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
-        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
-        compression_params.polynomial_size, compression_params.ks_base_log,
-        compression_params.ks_level, chunk_size);
-
-    rem_lwes -= chunk_size;
-    lwe_subset += chunk_size * lwe_in_size;
-    glwe_out += glwe_out_size;
-  }
-
-  // Modulus switch
-  host_modulus_switch_inplace<Torus>(
-      streams[0], gpu_indexes[0], tmp_glwe_array_out,
-      num_glwes_for_compression * (compression_params.glwe_dimension + 1) *
-          compression_params.polynomial_size,
-      mem_ptr->storage_log_modulus);
-
-  host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
-                   tmp_glwe_array_out, num_glwes_for_compression,
-                   num_radix_blocks, mem_ptr);
-}
-
-template <typename Torus>
-__global__ void extract(Torus *glwe_array_out, Torus const *array_in,
-                        uint32_t index, uint32_t log_modulus,
-                        uint32_t input_len, uint32_t initial_out_len) {
-  auto nbits = sizeof(Torus) * 8;
-
-  auto i = threadIdx.x + blockIdx.x * blockDim.x;
-  auto chunk_array_in = array_in + index * input_len;
-  if (i < initial_out_len) {
-    // Unpack
-    Torus mask = ((Torus)1 << log_modulus) - 1;
-    auto start = i * log_modulus;
-    auto end = (i + 1) * log_modulus;
-
-    auto start_block = start / nbits;
-    auto start_remainder = start % nbits;
-
-    auto end_block_inclusive = (end - 1) / nbits;
-
-    Torus unpacked_i;
-    if (start_block == end_block_inclusive) {
-      auto single_part = chunk_array_in[start_block] >> start_remainder;
-      unpacked_i = single_part & mask;
-    } else {
-      auto first_part = chunk_array_in[start_block] >> start_remainder;
-      auto second_part = chunk_array_in[start_block + 1]
-                         << (nbits - start_remainder);
-
-      unpacked_i = (first_part | second_part) & mask;
-    }
-
-    // Extract
-    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
-  }
-}
-
-/// Extracts the glwe_index-nth GLWE ciphertext
-template <typename Torus>
-__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
-                           Torus *glwe_array_out, Torus const *array_in,
-                           uint32_t glwe_index,
-                           int_decompression<Torus> *mem_ptr) {
-  if (array_in == glwe_array_out)
-    PANIC("Cuda error: Input and output must be different");
-
-  cudaSetDevice(gpu_index);
-
-  auto compression_params = mem_ptr->compression_params;
-
-  auto log_modulus = mem_ptr->storage_log_modulus;
-
-  uint32_t body_count =
-      std::min(mem_ptr->body_count, compression_params.polynomial_size);
-  auto initial_out_len =
-      compression_params.glwe_dimension * compression_params.polynomial_size +
-      body_count;
-
-  auto compressed_glwe_accumulator_size =
-      (compression_params.glwe_dimension + 1) *
-      compression_params.polynomial_size;
-  auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
-  auto nbits = sizeof(Torus) * 8;
-  // number_bits_to_unpack.div_ceil(Scalar::BITS)
-  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
-
-  // We assure the tail of the glwe is zeroed
-  auto zeroed_slice = glwe_array_out + initial_out_len;
-  cuda_memset_async(zeroed_slice, 0,
-                    (compression_params.polynomial_size - body_count) *
-                        sizeof(Torus),
-                    stream, gpu_index);
-  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
-  dim3 grid(num_blocks);
-  dim3 threads(num_threads);
-  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
-                                               glwe_index, log_modulus,
-                                               input_len, initial_out_len);
-  check_cuda_error(cudaGetLastError());
-}
-
-template <typename Torus>
-__host__ void host_integer_decompress(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in,
-    uint32_t const *h_indexes_array, uint32_t indexes_array_size,
-    void *const *d_bsks, int_decompression<Torus> *h_mem_ptr) {
-
-  auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
-  cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
-                           indexes_array_size * sizeof(uint32_t), streams[0],
-                           gpu_indexes[0]);
-
-  auto compression_params = h_mem_ptr->compression_params;
-  auto lwe_per_glwe = compression_params.polynomial_size;
-  if (indexes_array_size > lwe_per_glwe)
-    PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
-          "be smaller than "
-          "polynomial_size.")
-
-  auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
-  if (num_radix_blocks != indexes_array_size)
-    PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
-          "should be the same as indexes_array_size.")
-
-  // the first element is the last index in h_indexes_array that lies in the
-  // related GLWE
-  std::vector<std::pair<int, Torus *>> glwe_vec;
-
-  // Extract all GLWEs
-  Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
-                                compression_params.polynomial_size;
-
-  auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
-  auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
-  host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
-                      d_packed_glwe_in, current_glwe_index, h_mem_ptr);
-  glwe_vec.push_back(std::make_pair(0, extracted_glwe));
-  for (int i = 1; i < indexes_array_size; i++) {
-    auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
-    if (glwe_index != current_glwe_index) {
-      extracted_glwe += glwe_accumulator_size;
-      current_glwe_index = glwe_index;
-      // Extracts a new GLWE
-      host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
-                          d_packed_glwe_in, glwe_index, h_mem_ptr);
-      glwe_vec.push_back(std::make_pair(i, extracted_glwe));
-    } else {
-      // Updates the index
-      glwe_vec.back().first++;
-    }
-  }
-  // Sample extract all LWEs
-  Torus lwe_accumulator_size = compression_params.small_lwe_dimension + 1;
-
-  auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
-  uint32_t current_idx = 0;
-  auto d_indexes_array_chunk = d_indexes_array;
-  for (const auto &max_idx_and_glwe : glwe_vec) {
-    uint32_t last_idx = max_idx_and_glwe.first;
-    extracted_glwe = max_idx_and_glwe.second;
-
-    auto num_lwes = last_idx + 1 - current_idx;
-    cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
-                                extracted_glwe, d_indexes_array_chunk, num_lwes,
-                                compression_params.glwe_dimension,
-                                compression_params.polynomial_size);
-    d_indexes_array_chunk += num_lwes;
-    extracted_lwe += num_lwes * lwe_accumulator_size;
-    current_idx = last_idx;
-  }
-
-  // Reset
-  extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
-
-  // In the case of extracting a single LWE these parameters are dummy
-  uint32_t lut_count = 1;
-  uint32_t lut_stride = 0;
-  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-  /// dimension to a big LWE dimension
-  auto encryption_params = h_mem_ptr->encryption_params;
-  auto lut = h_mem_ptr->carry_extract_lut;
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  if (active_gpu_count == 1) {
-    execute_pbs_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
-        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
-        lut->lwe_indexes_in, d_bsks, lut->buffer,
-        encryption_params.glwe_dimension,
-        compression_params.small_lwe_dimension,
-        encryption_params.polynomial_size, encryption_params.pbs_base_log,
-        encryption_params.pbs_level, encryption_params.grouping_factor,
-        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
-  } else {
-    /// For multi GPU execution we create vectors of pointers for inputs and
-    /// outputs
-    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
-    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
-    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
-
-    /// Make sure all data that should be on GPU 0 is indeed there
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-    /// With multiple GPUs we push to the vectors on each GPU then when we
-    /// gather data to GPU 0 we can copy back to the original indexing
-    multi_gpu_scatter_lwe_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
-        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
-        compression_params.small_lwe_dimension + 1);
-
-    /// Apply PBS
-    execute_pbs_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
-        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
-        lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, lut->buffer,
-        encryption_params.glwe_dimension,
-        compression_params.small_lwe_dimension,
-        encryption_params.polynomial_size, encryption_params.pbs_base_log,
-        encryption_params.pbs_level, encryption_params.grouping_factor,
-        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
-
-    /// Copy data back to GPU 0 and release vecs
-    multi_gpu_gather_lwe_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
-        lwe_after_pbs_vec, lut->h_lwe_indexes_out,
-        lut->using_trivial_lwe_indexes, num_radix_blocks,
-        encryption_params.big_lwe_dimension + 1);
-
-    /// Synchronize all GPUs
-    for (uint i = 0; i < active_gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
-  }
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_compress_integer_radix_ciphertext(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_compression<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params compression_params,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_compression<Torus>(
-      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
-      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, uint32_t body_count,
-    int_radix_params encryption_params, int_radix_params compression_params,
-    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_decompression<Torus>(
-      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
-      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory);
-}
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -1,8 +1,8 @@
 #include "integer/div_rem.cuh"

 void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -20,23 +20,20 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
 }

 void cuda_integer_div_rem_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *quotient, void *remainder, void const *numerator, void const *divisor,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
+    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_blocks) {

  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

  host_integer_div_rem_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-      static_cast<const uint64_t *>(numerator),
-      static_cast<const uint64_t *>(divisor), bsks, (uint64_t **)(ksks), mem,
-      num_blocks);
+      static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+      bsks, (uint64_t **)(ksks), mem, num_blocks);
 }

-void cleanup_cuda_integer_div_rem(void *const *streams,
-                                  uint32_t const *gpu_indexes,
+void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void) {
  int_div_rem_memory<uint64_t> *mem_ptr =
      (int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -3,13 +3,13 @@

 #include "crypto/keyswitch.cuh"
 #include "device.h"
+#include "integer.h"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
-#include "integer/integer_utilities.h"
 #include "integer/negation.cuh"
 #include "integer/scalar_shifts.cuh"
 #include "linear_algebra.h"
-#include "pbs/programmable_bootstrap.h"
+#include "programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
@@ -160,23 +160,21 @@ template <typename Torus> struct lwe_ciphertext_list {

 template <typename Torus>
 __host__ void scratch_cuda_integer_div_rem_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_div_rem_memory<Torus> **mem_ptr,
-    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_div_rem_memory<Torus> **mem_ptr, uint32_t num_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

  *mem_ptr = new int_div_rem_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
-__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, Torus *quotient,
-                                      Torus *remainder, Torus const *numerator,
-                                      Torus const *divisor, void *const *bsks,
-                                      uint64_t *const *ksks,
-                                      int_div_rem_memory<uint64_t> *mem_ptr,
-                                      uint32_t num_blocks) {
+__host__ void
+host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
+                        Torus *numerator, Torus *divisor, void **bsks,
+                        uint64_t **ksks, int_div_rem_memory<uint64_t> *mem_ptr,
+                        uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;

@@ -224,8 +222,8 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
  lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
      mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);

-  numerator_block_stack.clone_from((Torus *)numerator, 0, num_blocks - 1,
-                                   streams[0], gpu_indexes[0]);
+  numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
+                                   gpu_indexes[0]);
  remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
  remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);

@@ -247,9 +245,9 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
                                      streams[0], gpu_indexes[0]);
    interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
                                      streams[0], gpu_indexes[0]);
-    interesting_divisor.clone_from((Torus *)divisor, 0, last_non_trivial_block,
+    interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
                                   streams[0], gpu_indexes[0]);
-    divisor_ms_blocks.clone_from((Torus *)divisor,
+    divisor_ms_blocks.clone_from(divisor,
                                 (msb_bit_set + 1) / num_bits_in_message,
                                 num_blocks - 1, streams[0], gpu_indexes[0]);

@@ -258,67 +256,65 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // msb_bit_set) the split versions share some bits they should not. So we do
    // one PBS on the last block of the interesting_divisor, and first block of
    // divisor_ms_blocks to trim out bits which should not be there
-    auto trim_last_interesting_divisor_bits = [&](cudaStream_t const *streams,
-                                                  uint32_t const *gpu_indexes,
-                                                  uint32_t gpu_count) {
-      if ((msb_bit_set + 1) % num_bits_in_message == 0) {
-        return;
-      }
-      // The last block of the interesting part of the remainder
-      // can contain bits which we should not account for
-      // we have to zero them out.
+    auto trim_last_interesting_divisor_bits =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          if ((msb_bit_set + 1) % num_bits_in_message == 0) {
+            return;
+          }
+          // The last block of the interesting part of the remainder
+          // can contain bits which we should not account for,
+          // so we have to zero them out.

-      // Where the msb is set in the block
-      uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
+          // Where the msb is set in the block
+          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

-      // e.g 2 bits in message:
-      // if pos_in_block is 0, then we want to keep only first bit (right
-      // shift
-      // mask by 1) if pos_in_block is 1, then we want to keep the two
-      // bits
-      // (right shift mask by 0)
-      uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
+          // e.g. with 2 bits in message:
+          //  - if pos_in_block is 0, then we want to keep only the first
+          //    bit (right shift the mask by 1)
+          //  - if pos_in_block is 1, then we want to keep the two bits
+          //    (right shift the mask by 0)
+          // i.e. shift by num_bits_in_message - (pos_in_block + 1)
+          uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);

-      // Create mask of 1s on the message part, 0s in the carries
-      uint32_t full_message_mask = message_modulus - 1;
+          // Create mask of 1s on the message part, 0s in the carries
+          uint32_t full_message_mask = message_modulus - 1;

-      // Shift the mask so that we will only keep bits we should
-      uint32_t shifted_mask = full_message_mask >> shift_amount;
+          // Shift the mask so that we will only keep bits we should
+          uint32_t shifted_mask = full_message_mask >> shift_amount;

-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
-          interesting_divisor.last_block(), bsks, ksks, 1,
-          mem_ptr->masking_luts_1[shifted_mask]);
-    }; // trim_last_interesting_divisor_bits
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
+              interesting_divisor.last_block(), bsks, ksks, 1,
+              mem_ptr->masking_luts_1[shifted_mask]);
+        }; // trim_last_interesting_divisor_bits

-    auto trim_first_divisor_ms_bits = [&](cudaStream_t const *streams,
-                                          uint32_t const *gpu_indexes,
-                                          uint32_t gpu_count) {
-      if (divisor_ms_blocks.is_empty() ||
-          ((msb_bit_set + 1) % num_bits_in_message) == 0) {
-        return;
-      }
-      // Where the msb is set in the block
-      uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
+    auto trim_first_divisor_ms_bits =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          if (divisor_ms_blocks.is_empty() ||
+              ((msb_bit_set + 1) % num_bits_in_message) == 0) {
+            return;
+          }
+          // Where the msb is set in the block
+          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

-      // e.g 2 bits in message:
-      // if pos_in_block is 0, then we want to discard the first bit (left
-      // shift mask by 1) if pos_in_block is 1, then we want to discard the
-      // two bits (left shift mask by 2) let shift_amount =
-      // num_bits_in_message - pos_in_block
-      uint32_t shift_amount = pos_in_block + 1;
-      uint32_t full_message_mask = message_modulus - 1;
-      uint32_t shifted_mask = full_message_mask << shift_amount;
+          // e.g. with 2 bits in message:
+          //  - if pos_in_block is 0, then we want to discard the first bit
+          //    (left shift the mask by 1)
+          //  - if pos_in_block is 1, then we want to discard the two bits
+          //    (left shift the mask by 2), i.e. shift by pos_in_block + 1
+          uint32_t shift_amount = pos_in_block + 1;
+          uint32_t full_message_mask = message_modulus - 1;
+          uint32_t shifted_mask = full_message_mask << shift_amount;

-      // Keep the mask within the range of message bits, so that
-      // the estimated degree of the output is < msg_modulus
-      shifted_mask = shifted_mask & full_message_mask;
+          // Keep the mask within the range of message bits, so that
+          // the estimated degree of the output is < msg_modulus
+          shifted_mask = shifted_mask & full_message_mask;

-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
-          divisor_ms_blocks.first_block(), bsks, ksks, 1,
-          mem_ptr->masking_luts_2[shifted_mask]);
-    }; // trim_first_divisor_ms_bits
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
+              divisor_ms_blocks.first_block(), bsks, ksks, 1,
+              mem_ptr->masking_luts_2[shifted_mask]);
+        }; // trim_first_divisor_ms_bits

    // This does
    //  R := R << 1; R(0) := N(i)
@@ -329,50 +325,48 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // However, to keep the remainder clean (noise wise), what we do is that we
    // put the remainder block from which we need to extract the bit, as the LSB
    // of the Remainder, so that left shifting will pull the bit we need.
-    auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams,
-                                                 uint32_t const *gpu_indexes,
-                                                 uint32_t gpu_count) {
-      numerator_block_1.clone_from(
-          numerator_block_stack, numerator_block_stack.len - 1,
-          numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
-      numerator_block_stack.pop();
-      interesting_remainder1.insert(0, numerator_block_1.first_block(),
-                                    streams[0], gpu_indexes[0]);
+    auto left_shift_interesting_remainder1 =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          numerator_block_1.clone_from(
+              numerator_block_stack, numerator_block_stack.len - 1,
+              numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
+          numerator_block_stack.pop();
+          interesting_remainder1.insert(0, numerator_block_1.first_block(),
+                                        streams[0], gpu_indexes[0]);

-      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
-          mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
+          host_integer_radix_logical_scalar_shift_kb_inplace(
+              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
+              mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

-      tmp_radix.clone_from(interesting_remainder1, 0,
-                           interesting_remainder1.len - 1, streams[0],
-                           gpu_indexes[0]);
+          tmp_radix.clone_from(interesting_remainder1, 0,
+                               interesting_remainder1.len - 1, streams[0],
+                               gpu_indexes[0]);

-      host_radix_blocks_rotate_left<Torus>(
-          streams, gpu_indexes, gpu_count, interesting_remainder1.data,
-          tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
+          host_radix_blocks_rotate_left(
+              streams, gpu_indexes, gpu_count, interesting_remainder1.data,
+              tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

-      numerator_block_1.clone_from(
-          interesting_remainder1, interesting_remainder1.len - 1,
-          interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
+          numerator_block_1.clone_from(
+              interesting_remainder1, interesting_remainder1.len - 1,
+              interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);

-      interesting_remainder1.pop();
+          interesting_remainder1.pop();

-      if (pos_in_block != 0) {
-        // We have not yet extracted all the bits from this numerator
-        // so, we put it back on the front so that it gets taken next
-        // iteration
-        numerator_block_stack.push(numerator_block_1.first_block(), streams[0],
-                                   gpu_indexes[0]);
-      }
-    }; // left_shift_interesting_remainder1
+          if (pos_in_block != 0) {
+            // We have not yet extracted all the bits from this numerator
+            // so, we put it back on the front so that it gets taken next
+            // iteration
+            numerator_block_stack.push(numerator_block_1.first_block(),
+                                       streams[0], gpu_indexes[0]);
+          }
+        }; // left_shift_interesting_remainder1

-    auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams,
-                                                 uint32_t const *gpu_indexes,
-                                                 uint32_t gpu_count) {
-      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
-          mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
-    }; // left_shift_interesting_remainder2
+    auto left_shift_interesting_remainder2 =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          host_integer_radix_logical_scalar_shift_kb_inplace(
+              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
+              mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
+        }; // left_shift_interesting_remainder2

    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
@@ -402,10 +396,10 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

-    host_addition<Torus>(
-        streams[0], gpu_indexes[0], merged_interesting_remainder.data,
-        merged_interesting_remainder.data, interesting_remainder2.data,
-        radix_params.big_lwe_dimension, merged_interesting_remainder.len);
+    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
+                  merged_interesting_remainder.data,
+                  interesting_remainder2.data, radix_params.big_lwe_dimension,
+                  merged_interesting_remainder.len);

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
@@ -422,8 +416,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // fills:
    //  `new_remainder` - radix ciphertext
    //  `subtraction_overflowed` - single ciphertext
-    auto do_overflowing_sub = [&](cudaStream_t const *streams,
-                                  uint32_t const *gpu_indexes,
+    auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count) {
      host_integer_overflowing_sub_kb<Torus>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
@@ -434,8 +427,8 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,

    // fills:
    //  `at_least_one_upper_block_is_non_zero` - single ciphertext
-    auto check_divisor_upper_blocks = [&](cudaStream_t const *streams,
-                                          uint32_t const *gpu_indexes,
+    auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
+                                          uint32_t *gpu_indexes,
                                          uint32_t gpu_count) {
      auto &trivial_blocks = divisor_ms_blocks;
      if (trivial_blocks.is_empty()) {
@@ -446,7 +439,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
        // We could call unchecked_scalar_ne
        // But we are in the special case where scalar == 0
        // So we can skip some stuff
-        host_compare_with_zero_equality<Torus>(
+        host_compare_with_zero_equality(
            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -454,7 +447,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
        tmp_1.len =
            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);

-        is_at_least_one_comparisons_block_true<Torus>(
+        is_at_least_one_comparisons_block_true(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
@@ -466,9 +459,8 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // fills:
    //  `cleaned_merged_interesting_remainder` - radix ciphertext
    auto create_clean_version_of_merged_remainder =
-        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
-            uint32_t gpu_count) {
-          integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -494,10 +486,10 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
-                         subtraction_overflowed.data,
-                         at_least_one_upper_block_is_non_zero.data,
-                         radix_params.big_lwe_dimension, 1);
+    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
+                  subtraction_overflowed.data,
+                  at_least_one_upper_block_is_non_zero.data,
+                  radix_params.big_lwe_dimension, 1);

    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
@@ -506,8 +498,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
        streams[0], gpu_indexes[0]);

    auto conditionally_zero_out_merged_interesting_remainder =
-        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
-            uint32_t gpu_count) {
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
@@ -519,8 +510,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
        };

    auto conditionally_zero_out_merged_new_remainder =
-        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
-            uint32_t gpu_count) {
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, new_remainder.data,
              new_remainder.data, overflow_sum_radix.data, bsks, ksks,
@@ -528,8 +518,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
              mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
        };

-    auto set_quotient_bit = [&](cudaStream_t const *streams,
-                                uint32_t const *gpu_indexes,
+    auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count) {
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, did_not_overflow.data,
@@ -539,10 +528,10 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);

-      host_addition<Torus>(
-          streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
-          &quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
-          radix_params.big_lwe_dimension, 1);
+      host_addition(streams[0], gpu_indexes[0],
+                    &quotient[block_of_bit * big_lwe_size],
+                    &quotient[block_of_bit * big_lwe_size],
+                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
    };

    for (uint j = 0; j < gpu_count; j++) {
@@ -575,17 +564,17 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,

  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
-  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
-                       remainder2.data, radix_params.big_lwe_dimension,
-                       remainder1.len);
+  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                remainder2.data, radix_params.big_lwe_dimension,
+                remainder1.len);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, num_blocks, mem_ptr->message_extract_lut_2);
  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -1,11 +1,10 @@
 #include "integer/integer.cuh"
 #include <linear_algebra.h>

-void cuda_full_propagation_64_inplace(void *const *streams,
-                                      uint32_t const *gpu_indexes,
+void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
                                      uint32_t gpu_count, void *input_blocks,
-                                      int8_t *mem_ptr, void *const *ksks,
-                                      void *const *bsks, uint32_t num_blocks) {
+                                      int8_t *mem_ptr, void **ksks, void **bsks,
+                                      uint32_t num_blocks) {

  int_fullprop_buffer<uint64_t> *buffer =
      (int_fullprop_buffer<uint64_t> *)mem_ptr;
@@ -17,12 +16,11 @@ void cuda_full_propagation_64_inplace(void *const *streams,
 }

 void scratch_cuda_full_propagation_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -33,8 +31,7 @@ void scratch_cuda_full_propagation_64(
      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
 }

-void cleanup_cuda_full_propagation(void *const *streams,
-                                   uint32_t const *gpu_indexes,
+void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
                                   uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_fullprop_buffer<uint64_t> *mem_ptr =
@@ -44,8 +41,8 @@ void cleanup_cuda_full_propagation(void *const *streams,
 }

 void scratch_cuda_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -56,16 +53,16 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

-  scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
+  scratch_cuda_propagate_single_carry_kb_inplace(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

 void cuda_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {
  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
@@ -74,9 +71,9 @@ void cuda_propagate_single_carry_kb_64_inplace(
 }

 void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
+    void **ksks, uint32_t num_blocks) {
  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
@@ -85,8 +82,7 @@ void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
      num_blocks);
 }

-void cleanup_cuda_propagate_single_carry(void *const *streams,
-                                         uint32_t const *gpu_indexes,
+void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void) {
  int_sc_prop_memory<uint64_t> *mem_ptr =
@@ -95,13 +91,12 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
 }

 void scratch_cuda_apply_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -110,56 +105,39 @@ void scratch_cuda_apply_univariate_lut_kb_64(

  scratch_cuda_apply_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_radix_lut<uint64_t> **)mem_ptr,
-      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      allocate_gpu_memory);
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
+      num_radix_blocks, params, allocate_gpu_memory);
 }

-void cuda_apply_univariate_lut_kb_64(void *const *streams,
-                                     uint32_t const *gpu_indexes,
+void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
-                                     void const *input_radix_lwe,
-                                     int8_t *mem_ptr, void *const *ksks,
-                                     void *const *bsks, uint32_t num_blocks) {
+                                     void *input_radix_lwe, int8_t *mem_ptr,
+                                     void **ksks, void **bsks,
+                                     uint32_t num_blocks) {

  host_apply_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<const uint64_t *>(input_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe),
      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
      num_blocks);
 }

-void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
-                                             uint32_t const *gpu_indexes,
+void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
+                                             uint32_t *gpu_indexes,
                                             uint32_t gpu_count,
                                             int8_t **mem_ptr_void) {
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void cuda_apply_many_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks,
-    uint32_t lut_count, uint32_t lut_stride) {
-
-  host_apply_many_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<const uint64_t *>(input_radix_lwe),
-      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
-      lut_count, lut_stride);
-}
-
 void scratch_cuda_apply_bivariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -172,23 +150,24 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
      num_radix_blocks, params, allocate_gpu_memory);
 }

-void cuda_apply_bivariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void const *input_radix_lwe_1,
-    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
-    void *const *bsks, uint32_t num_blocks, uint32_t shift) {
+void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
+                                    uint32_t gpu_count, void *output_radix_lwe,
+                                    void *input_radix_lwe_1,
+                                    void *input_radix_lwe_2, int8_t *mem_ptr,
+                                    void **ksks, void **bsks,
+                                    uint32_t num_blocks, uint32_t shift) {

  host_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<const uint64_t *>(input_radix_lwe_1),
-      static_cast<const uint64_t *>(input_radix_lwe_2),
+      static_cast<uint64_t *>(input_radix_lwe_1),
+      static_cast<uint64_t *>(input_radix_lwe_2),
      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
      shift);
 }

-void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
-                                            uint32_t const *gpu_indexes,
+void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
+                                            uint32_t *gpu_indexes,
                                            uint32_t gpu_count,
                                            int8_t **mem_ptr_void) {
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
@@ -196,13 +175,12 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
 }

 void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -211,35 +189,34 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_radix_lut<uint64_t> **)mem_ptr,
-      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      allocate_gpu_memory);
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
+      num_radix_blocks, params, allocate_gpu_memory);
 }

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t shift) {

  int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<uint64_t *>(generates_or_propagates), params,
+      static_cast<uint64_t *>(input_radix_lwe), params,
      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      num_blocks);
 }

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
-                                            uint32_t const *gpu_indexes,
+void cuda_integer_reverse_blocks_64_inplace(void **streams,
+                                            uint32_t *gpu_indexes,
                                            uint32_t gpu_count, void *lwe_array,
                                            uint32_t num_blocks,
                                            uint32_t lwe_size) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -4,12 +4,12 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "helper_multi_gpu.h"
-#include "integer/integer_utilities.h"
+#include "integer.h"
 #include "integer/scalar_addition.cuh"
 #include "linear_algebra.h"
 #include "linearalgebra/addition.cuh"
-#include "pbs/programmable_bootstrap.h"
 #include "polynomial/functions.cuh"
+#include "programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
 #include "utils/kernel_dimensions.cuh"
@@ -69,16 +69,16 @@ __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
 // one block is responsible to process single lwe ciphertext
 template <typename Torus>
 __host__ void
-host_radix_blocks_rotate_right(cudaStream_t const *streams,
-                               uint32_t const *gpu_indexes, uint32_t gpu_count,
-                               Torus *dst, Torus *src, uint32_t value,
-                               uint32_t blocks_count, uint32_t lwe_size) {
+host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
+                               uint32_t gpu_count, Torus *dst, Torus *src,
+                               uint32_t value, uint32_t blocks_count,
+                               uint32_t lwe_size) {
  if (src == dst) {
    PANIC("Cuda error (blocks_rotate_right): the source and destination "
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -86,16 +86,16 @@ host_radix_blocks_rotate_right(cudaStream_t const *streams,
 // calculation is not inplace, so `dst` and `src` must not be the same
 template <typename Torus>
 __host__ void
-host_radix_blocks_rotate_left(cudaStream_t const *streams,
-                              uint32_t const *gpu_indexes, uint32_t gpu_count,
-                              Torus *dst, Torus *src, uint32_t value,
-                              uint32_t blocks_count, uint32_t lwe_size) {
+host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
+                              uint32_t gpu_count, Torus *dst, Torus *src,
+                              uint32_t value, uint32_t blocks_count,
+                              uint32_t lwe_size) {
  if (src == dst) {
    PANIC("Cuda error (blocks_rotate_left): the source and destination "
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -119,23 +119,22 @@ __global__ void radix_blocks_reverse_lwe_inplace(Torus *src,

 template <typename Torus>
 __host__ void
-host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
-                                  uint32_t const *gpu_indexes, Torus *src,
-                                  uint32_t blocks_count, uint32_t lwe_size) {
+host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                  Torus *src, uint32_t blocks_count,
+                                  uint32_t lwe_size) {
  cudaSetDevice(gpu_indexes[0]);
  int num_blocks = blocks_count / 2, num_threads = 1024;
-  radix_blocks_reverse_lwe_inplace<Torus>
-      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
+  radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
+      src, blocks_count, lwe_size);
 }

 // polynomial_size threads
 template <typename Torus>
 __global__ void
-device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out,
-                             Torus const *lwe_array_1, Torus const *lwe_array_2,
-                             Torus const *lwe_indexes_in,
-                             uint32_t lwe_dimension, uint32_t shift,
-                             uint32_t num_blocks) {
+device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
+                             Torus *lwe_array_1, Torus *lwe_array_2,
+                             Torus *lwe_indexes_in, uint32_t lwe_dimension,
+                             uint32_t shift, uint32_t num_blocks) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if (tid < num_blocks * (lwe_dimension + 1)) {
@@ -152,32 +151,30 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out,
 *  becomes out = m1 * shift + m2
 */
 template <typename Torus>
-__host__ void
-pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                      uint32_t gpu_count, Torus *lwe_array_out,
-                      Torus const *lwe_indexes_out, Torus const *lwe_array_1,
-                      Torus const *lwe_array_2, Torus const *lwe_indexes_in,
-                      uint32_t lwe_dimension, uint32_t shift,
-                      uint32_t num_radix_blocks) {
+__host__ void pack_bivariate_blocks(cudaStream_t *streams,
+                                    uint32_t *gpu_indexes, uint32_t gpu_count,
+                                    Torus *lwe_array_out,
+                                    Torus *lwe_indexes_out, Torus *lwe_array_1,
+                                    Torus *lwe_array_2, Torus *lwe_indexes_in,
+                                    uint32_t lwe_dimension, uint32_t shift,
+                                    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  // Left message is shifted
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_bivariate_blocks<Torus>
-      <<<num_blocks, num_threads, 0, streams[0]>>>(
-          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
-          lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
+  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
+      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
+      lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }

 template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
-    int_radix_lut<Torus> *lut) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  // apply_lookup_table
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
@@ -191,9 +188,6 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

-  // In the case of extracting a single LWE this parameters are dummy
-  uint32_t lut_count = 1;
-  uint32_t lut_stride = 0;
  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -204,10 +198,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
  if (active_gpu_count == 1) {
    execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
-                                   lwe_trivial_indexes_vec[0],
-                                   (Torus *)lwe_array_in, lut->lwe_indexes_in,
-                                   ksks, big_lwe_dimension, small_lwe_dimension,
-                                   ks_base_log, ks_level, num_radix_blocks);
+                                   lwe_trivial_indexes_vec[0], lwe_array_in,
+                                   lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                                   small_lwe_dimension, ks_base_log, ks_level,
+                                   num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
@@ -216,7 +210,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
+        grouping_factor, num_radix_blocks, pbs_type);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -242,92 +236,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
-        lut_stride);
-
-    /// Copy data back to GPU 0 and release vecs
-    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
-                                      lwe_array_out, lwe_after_pbs_vec,
-                                      lut->h_lwe_indexes_out,
-                                      lut->using_trivial_lwe_indexes,
-                                      num_radix_blocks, big_lwe_dimension + 1);
-
-    /// Synchronize all GPUs
-    for (uint i = 0; i < active_gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
-  }
-}
-
-template <typename Torus>
-__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
-    int_radix_lut<Torus> *lut, uint32_t lut_count, uint32_t lut_stride) {
-  // apply_lookup_table
-  auto params = lut->params;
-  auto pbs_type = params.pbs_type;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto small_lwe_dimension = params.small_lwe_dimension;
-  auto ks_level = params.ks_level;
-  auto ks_base_log = params.ks_base_log;
-  auto pbs_level = params.pbs_level;
-  auto pbs_base_log = params.pbs_base_log;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto grouping_factor = params.grouping_factor;
-
-  /// For multi GPU execution we create vectors of pointers for inputs and
-  /// outputs
-  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
-  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
-  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
-  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
-
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  if (active_gpu_count == 1) {
-    execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
-                                   lwe_trivial_indexes_vec[0],
-                                   (Torus *)lwe_array_in, lut->lwe_indexes_in,
-                                   ksks, big_lwe_dimension, small_lwe_dimension,
-                                   ks_base_log, ks_level, num_radix_blocks);
-
-    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-    /// dimension to a big LWE dimension
-    execute_pbs_async<Torus>(
-        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
-  } else {
-    /// Make sure all data that should be on GPU 0 is indeed there
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-    /// With multiple GPUs we push to the vectors on each GPU then when we
-    /// gather data to GPU 0 we can copy back to the original indexing
-    multi_gpu_scatter_lwe_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
-        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
-        big_lwe_dimension + 1);
-
-    /// Apply KS to go from a big LWE dimension to a small LWE dimension
-    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
-                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
-                                   lwe_array_in_vec, lwe_trivial_indexes_vec,
-                                   ksks, big_lwe_dimension, small_lwe_dimension,
-                                   ks_base_log, ks_level, num_radix_blocks);
-
-    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-    /// dimension to a big LWE dimension
-    execute_pbs_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
-        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
-        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
-        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
-        lut_stride);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -345,10 +254,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(

 template <typename Torus>
 __host__ void integer_radix_apply_bivariate_lookup_table_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
-    Torus const *lwe_array_2, void *const *bsks, Torus *const *ksks,
-    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t shift) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
+    uint32_t shift) {

  auto params = lut->params;
  auto pbs_type = params.pbs_type;
@@ -362,16 +271,12 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

-  // In the case of extracting a single LWE this parameters are dummy
-  uint32_t lut_count = 1;
-  uint32_t lut_stride = 0;
-
  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
-  pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
-                               lwe_array_pbs_in, lut->lwe_trivial_indexes,
-                               lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
-                               big_lwe_dimension, shift, num_radix_blocks);
+  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
+                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
+                        lut->lwe_indexes_in, big_lwe_dimension, shift,
+                        num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  /// For multi GPU execution we create vectors of pointers for inputs and
@@ -396,7 +301,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
+        grouping_factor, num_radix_blocks, pbs_type);
  } else {
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    multi_gpu_scatter_lwe_async<Torus>(
@@ -418,8 +323,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
-        lut_stride);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -476,7 +380,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
    body[i] = -body[i];
  }

-  rotate_left<Torus>(body, half_box_size, polynomial_size);
+  rotate_left(body, half_box_size, polynomial_size);
 }

 template <typename Torus>
@@ -538,6 +442,7 @@ void generate_device_accumulator_bivariate(
                                         message_modulus, carry_modulus, f);

  // copy host lut and lut_indexes_vec to device
+  cuda_synchronize_stream(stream, gpu_index);
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
@@ -603,6 +508,7 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

+  cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -614,10 +520,9 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,

 template <typename Torus>
 void scratch_cuda_propagate_single_carry_kb_inplace(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_sc_prop_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

  *mem_ptr =
      new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
@@ -626,10 +531,10 @@ void scratch_cuda_propagate_single_carry_kb_inplace(

 template <typename Torus>
 void host_compute_prefix_sum_hillis_steele(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *step_output, Torus *generates_or_propagates,
-    int_radix_params params, int_radix_lut<Torus> *luts, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
+    int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
+    uint32_t num_blocks) {

  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -662,17 +567,12 @@ void host_compute_prefix_sum_hillis_steele(
 }

 template <typename Torus>
-void host_propagate_single_carry(cudaStream_t const *streams,
-                                 uint32_t const *gpu_indexes,
+void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *lwe_array,
                                 Torus *carry_out, Torus *input_carries,
-                                 int_sc_prop_memory<Torus> *mem,
-                                 void *const *bsks, Torus *const *ksks,
-                                 uint32_t num_blocks) {
+                                 int_sc_prop_memory<Torus> *mem, void **bsks,
+                                 Torus **ksks, uint32_t num_blocks) {
  auto params = mem->params;
-  if (params.message_modulus == 2)
-    PANIC("Cuda error: single carry propagation is not supported for 1 bit "
-          "messages")
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
@@ -690,13 +590,13 @@ void host_propagate_single_carry(cudaStream_t const *streams,
      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
-  host_compute_prefix_sum_hillis_steele<Torus>(
+  host_compute_prefix_sum_hillis_steele(
      streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
      params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

-  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
-                                        step_output, generates_or_propagates, 1,
-                                        num_blocks, big_lwe_size);
+  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
+                                 generates_or_propagates, 1, num_blocks,
+                                 big_lwe_size);
  if (carry_out != nullptr) {
    cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
                                 streams[0], gpu_indexes[0]);
@@ -705,14 +605,13 @@ void host_propagate_single_carry(cudaStream_t const *streams,
                    gpu_indexes[0]);

  if (input_carries != nullptr) {
-    cuda_memcpy_async_gpu_to_gpu((void *)input_carries, step_output,
+    cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
                                 big_lwe_size_bytes * num_blocks, streams[0],
                                 gpu_indexes[0]);
  }

-  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                       step_output, glwe_dimension * polynomial_size,
-                       num_blocks);
+  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
+                glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -721,10 +620,10 @@ void host_propagate_single_carry(cudaStream_t const *streams,

 template <typename Torus>
 void host_generate_last_block_inner_propagation(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *last_block_inner_propagation, Torus const *lhs,
-    Torus const *rhs, int_last_block_inner_propagate_memory<Torus> *mem,
-    void *const *bsks, Torus *const *ksks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs,
+    int_last_block_inner_propagate_memory<Torus> *mem, void **bsks,
+    Torus **ksks) {

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs,
@@ -733,12 +632,11 @@ void host_generate_last_block_inner_propagation(
 }

 template <typename Torus>
-void host_propagate_single_sub_borrow(cudaStream_t const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, Torus *overflowed,
-                                      Torus *lwe_array,
+void host_propagate_single_sub_borrow(cudaStream_t *streams,
+                                      uint32_t *gpu_indexes, uint32_t gpu_count,
+                                      Torus *overflowed, Torus *lwe_array,
                                      int_overflowing_sub_memory<Torus> *mem,
-                                      void *const *bsks, Torus *const *ksks,
+                                      void **bsks, Torus **ksks,
                                      uint32_t num_blocks) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
@@ -766,15 +664,14 @@ void host_propagate_single_sub_borrow(cudaStream_t const *streams,
      overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
      big_lwe_size_bytes, streams[0], gpu_indexes[0]);

-  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
-                                        step_output, generates_or_propagates, 1,
-                                        num_blocks, big_lwe_size);
+  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
+                                 generates_or_propagates, 1, num_blocks,
+                                 big_lwe_size);
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                    gpu_indexes[0]);

-  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                          step_output, glwe_dimension * polynomial_size,
-                          num_blocks);
+  host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                   step_output, glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -790,20 +687,16 @@ void host_propagate_single_sub_borrow(cudaStream_t const *streams,
 * have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
 template <typename Torus>
-void host_full_propagate_inplace(cudaStream_t const *streams,
-                                 uint32_t const *gpu_indexes,
+void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *input_blocks,
                                 int_fullprop_buffer<Torus> *mem_ptr,
-                                 Torus *const *ksks, void *const *bsks,
+                                 Torus **ksks, void **bsks,
                                 uint32_t num_blocks) {
  auto params = mem_ptr->lut->params;

  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
  int small_lwe_size = (params.small_lwe_dimension + 1);

-  // In the case of extracting a single LWE this parameters are dummy
-  uint32_t lut_count = 1;
-  uint32_t lut_stride = 0;
  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

@@ -826,25 +719,24 @@ void host_full_propagate_inplace(cudaStream_t const *streams,
        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
        params.glwe_dimension, params.small_lwe_dimension,
        params.polynomial_size, params.pbs_base_log, params.pbs_level,
-        params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);
+        params.grouping_factor, 2, params.pbs_type);

-    cuda_memcpy_async_gpu_to_gpu(
-        (void *)cur_input_block, mem_ptr->tmp_big_lwe_vector,
-        big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
+                                 big_lwe_size * sizeof(Torus), streams[0],
+                                 gpu_indexes[0]);

    if (i < num_blocks - 1) {
      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
-                           (Torus const *)next_input_block,
-                           &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                           params.big_lwe_dimension, 1);
+      host_addition(streams[0], gpu_indexes[0], next_input_block,
+                    next_input_block,
+                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
+                    params.big_lwe_dimension, 1);
    }
  }
 }

 template <typename Torus>
-void scratch_cuda_full_propagation(cudaStream_t const *streams,
-                                   uint32_t const *gpu_indexes,
+void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
                                   uint32_t gpu_count,
                                   int_fullprop_buffer<Torus> **mem_ptr,
                                   int_radix_params params,
@@ -857,16 +749,14 @@ void scratch_cuda_full_propagation(cudaStream_t const *streams,
 // (lwe_dimension+1) threads
 // (num_radix_blocks / 2) thread blocks
 template <typename Torus>
-__global__ void device_pack_blocks(Torus *lwe_array_out,
-                                   Torus const *lwe_array_in,
+__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
                                   uint32_t lwe_dimension,
                                   uint32_t num_radix_blocks, uint32_t factor) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if (tid < (lwe_dimension + 1)) {
    for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
-      Torus *lsb_block =
-          (Torus *)lwe_array_in + (2 * bid) * (lwe_dimension + 1);
+      Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
      Torus *msb_block = lsb_block + (lwe_dimension + 1);

      Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
@@ -875,9 +765,9 @@ __global__ void device_pack_blocks(Torus *lwe_array_out,
    }

    if (num_radix_blocks % 2 == 1) {
-      // We couldn't host_pack the last block, so we just copy it
+      // We couldn't pack the last block, so we just copy it
      Torus *lsb_block =
-          (Torus *)lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
+          lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
      Torus *last_block =
          lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);

@@ -895,7 +785,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out,
 // Expects the carry buffer to be empty
 template <typename Torus>
 __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
-                          Torus *lwe_array_out, Torus const *lwe_array_in,
+                          Torus *lwe_array_out, Torus *lwe_array_in,
                          uint32_t lwe_dimension, uint32_t num_radix_blocks,
                          uint32_t factor) {
  if (num_radix_blocks == 0)
@@ -904,13 +794,13 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
-  device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
+  device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
 }

 template <typename Torus>
 __global__ void
-device_create_trivial_radix(Torus *lwe_array, Torus const *scalar_input,
+device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
                            int32_t num_blocks, uint32_t lwe_dimension,
                            uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -925,7 +815,7 @@ device_create_trivial_radix(Torus *lwe_array, Torus const *scalar_input,
 template <typename Torus>
 __host__ void
 create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
-                     Torus *lwe_array_out, Torus const *scalar_array,
+                     Torus *lwe_array_out, Torus *scalar_array,
                     uint32_t lwe_dimension, uint32_t num_radix_blocks,
                     uint32_t num_scalar_blocks, uint64_t message_modulus,
                     uint64_t carry_modulus) {
@@ -950,7 +840,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
+  device_create_trivial_radix<<<grid, thds, 0, stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }
@@ -961,26 +851,26 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
 * * (lwe_dimension+1) * sizeeof(Torus) bytes
 */
 template <typename Torus>
-__host__ void extract_n_bits(cudaStream_t const *streams,
-                             uint32_t const *gpu_indexes, uint32_t gpu_count,
-                             Torus *lwe_array_out, Torus *lwe_array_in,
-                             void *const *bsks, Torus *const *ksks,
+__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count, Torus *lwe_array_out,
+                             Torus *lwe_array_in, void **bsks, Torus **ksks,
                             uint32_t num_radix_blocks, uint32_t bits_per_block,
                             int_bit_extract_luts_buffer<Torus> *bit_extract) {

-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
      num_radix_blocks * bits_per_block, bit_extract->lut);
 }

 template <typename Torus>
 __host__ void
-reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-             uint32_t gpu_count, Torus *signs_array_out, Torus *signs_array_in,
+reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+             Torus *signs_array_out, Torus *signs_array_in,
             int_comparison_buffer<Torus> *mem_ptr,
-             std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
-             Torus *const *ksks, uint32_t num_sign_blocks) {
+             std::function<Torus(Torus)> sign_handler_f, void **bsks,
+             Torus **ksks, uint32_t num_sign_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -1014,9 +904,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    while (num_sign_blocks > 2) {
-      pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
-                         big_lwe_dimension, num_sign_blocks, 4);
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
+                  big_lwe_dimension, num_sign_blocks, 4);
+      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
          num_sign_blocks / 2, lut);

@@ -1047,11 +937,11 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
        final_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
-                       big_lwe_dimension, 2, 4);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
-        1, lut);
+    pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
+                2, 4);
+    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                   gpu_count, signs_array_out,
+                                                   signs_b, bsks, ksks, 1, lut);

  } else {

@@ -1067,79 +957,65 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
        final_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
-        1, lut);
+    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                   gpu_count, signs_array_out,
+                                                   signs_a, bsks, ksks, 1, lut);
  }
 }

 template <typename Torus>
 void scratch_cuda_apply_univariate_lut_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
-  cuda_memcpy_async_to_gpu(
-      (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
-      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
-      streams[0], gpu_indexes[0]);
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
+                           (params.glwe_dimension + 1) *
+                               params.polynomial_size * sizeof(Torus),
+                           streams[0], gpu_indexes[0]);
  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 }

 template <typename Torus>
-void host_apply_univariate_lut_kb(cudaStream_t const *streams,
-                                  uint32_t const *gpu_indexes,
+void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count, Torus *radix_lwe_out,
-                                  Torus const *radix_lwe_in,
-                                  int_radix_lut<Torus> *mem, Torus *const *ksks,
-                                  void *const *bsks, uint32_t num_blocks) {
+                                  Torus *radix_lwe_in,
+                                  int_radix_lut<Torus> *mem, Torus **ksks,
+                                  void **bsks, uint32_t num_blocks) {

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
      num_blocks, mem);
 }

-template <typename Torus>
-void host_apply_many_univariate_lut_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in,
-    int_radix_lut<Torus> *mem, Torus *const *ksks, void *const *bsks,
-    uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
-
-  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
-      num_blocks, mem, lut_count, lut_stride);
-}
-
 template <typename Torus>
 void scratch_cuda_apply_bivariate_lut_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
-  cuda_memcpy_async_to_gpu(
-      (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
-      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
-      streams[0], gpu_indexes[0]);
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
+                           (params.glwe_dimension + 1) *
+                               params.polynomial_size * sizeof(Torus),
+                           streams[0], gpu_indexes[0]);
  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 }

 template <typename Torus>
-void host_apply_bivariate_lut_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in_1,
-    Torus const *radix_lwe_in_2, int_radix_lut<Torus> *mem, Torus *const *ksks,
-    void *const *bsks, uint32_t num_blocks, uint32_t shift) {
+void host_apply_bivariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, Torus *radix_lwe_out,
+                                 Torus *radix_lwe_in_1, Torus *radix_lwe_in_2,
+                                 int_radix_lut<Torus> *mem, Torus **ksks,
+                                 void **bsks, uint32_t num_blocks,
+                                 uint32_t shift) {

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in_1,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
 * the integer radix multiplication in keyswitch->bootstrap order.
 */
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size * glwe_dimension, lwe_dimension,
@@ -87,7 +87,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
  case 8192:
  case 16384:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
-        (cudaStream_t const *)(streams), gpu_indexes, gpu_count,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
        allocate_gpu_memory);
    break;
@@ -125,67 +125,67 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 * - 'pbs_type' selects which PBS implementation should be used
 */
 void cuda_integer_mult_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
+    void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks) {

  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 512:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 1024:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 2048:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 4096:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 8192:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 16384:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
-        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -193,9 +193,8 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
  }
 }

-void cleanup_cuda_integer_mult(void *const *streams,
-                               uint32_t const *gpu_indexes, uint32_t gpu_count,
-                               int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
+                               uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_mul_memory<uint64_t> *mem_ptr =
      (int_mul_memory<uint64_t> *)(*mem_ptr_void);
@@ -204,10 +203,10 @@ void cleanup_cuda_integer_mult(void *const *streams,
 }

 void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {
@@ -223,10 +222,9 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
 }

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks_in_radix) {
+    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {

  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;

@@ -243,8 +241,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
-        nullptr);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 1024:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -252,8 +249,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
-        nullptr);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 2048:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -261,8 +257,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
-        nullptr);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 4096:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -270,8 +265,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
-        nullptr);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 8192:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -279,8 +273,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
-        nullptr);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 16384:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -288,8 +281,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
-        nullptr);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -300,7 +292,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
 }

 void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
  int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
      (int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -9,10 +9,10 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "helper_multi_gpu.h"
+#include "integer.h"
 #include "integer/integer.cuh"
-#include "integer/integer_utilities.h"
 #include "linear_algebra.h"
-#include "pbs/programmable_bootstrap.h"
+#include "programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
 #include "utils/kernel_dimensions.cuh"
@@ -43,8 +43,8 @@ __global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,

 template <typename Torus, class params>
 __global__ void
-all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
-                    Torus *msb_ciphertext, Torus const *radix_lwe_right,
+all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
+                    Torus *msb_ciphertext, Torus *radix_lwe_right,
                    Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {

  size_t block_id = blockIdx.x;
@@ -170,8 +170,8 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
 }
 template <typename Torus>
 __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    int_radix_params params, bool allocate_gpu_memory) {

@@ -182,15 +182,13 @@ __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(

 template <typename Torus, class params>
 __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *radix_lwe_out, Torus *terms, int *terms_degree,
-    void *const *bsks, uint64_t *const *ksks,
-    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
+    uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
-    int_radix_lut<Torus> *reused_lut) {
+    int_radix_lut<Torus> *reused_lut = nullptr) {

  auto new_blocks = mem_ptr->new_blocks;
-  auto new_blocks_copy = mem_ptr->new_blocks_copy;
  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

@@ -207,31 +205,12 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto small_lwe_size = small_lwe_dimension + 1;

-  // In the case of extracting a single LWE this parameters are dummy
-  uint32_t lut_count = 1;
-  uint32_t lut_stride = 0;
-
-  if (num_radix_in_vec == 0)
-    return;
-  if (num_radix_in_vec == 1) {
-    cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
-                                 num_blocks_in_radix * big_lwe_size *
-                                     sizeof(Torus),
-                                 streams[0], gpu_indexes[0]);
-    return;
-  }
  if (old_blocks != terms) {
    cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
                                 num_blocks_in_radix * num_radix_in_vec *
                                     big_lwe_size * sizeof(Torus),
                                 streams[0], gpu_indexes[0]);
  }
-  if (num_radix_in_vec == 2) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                         &old_blocks[num_blocks * big_lwe_size],
-                         big_lwe_dimension, num_blocks);
-    return;
-  }

  size_t r = num_radix_in_vec;
  size_t total_modulus = message_modulus * carry_modulus;
@@ -292,6 +271,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    if (!ch_amount)
      ch_amount++;
    dim3 add_grid(ch_amount, num_blocks, 1);
+    size_t sm_size = big_lwe_size * sizeof(Torus);

    cudaSetDevice(gpu_indexes[0]);
    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
@@ -308,6 +288,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
        terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
        h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
        total_count, message_count, carry_count, sm_copy_count);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
@@ -322,11 +303,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    // inside d_smart_copy_in there are only -1 values
    // it's fine to call smart_copy with same pointer
    // as source and destination
-    cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
-                                 r * num_blocks * big_lwe_size * sizeof(Torus),
-                                 streams[0], gpu_indexes[0]);
-    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
-        new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
+    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
+        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
        big_lwe_size);
    check_cuda_error(cudaGetLastError());

@@ -369,7 +347,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type, lut_count, lut_stride);
+          mem_ptr->params.pbs_type);
    } else {
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);

@@ -417,7 +395,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type, lut_count, lut_stride);
+          mem_ptr->params.pbs_type);

      multi_gpu_gather_lwe_async<Torus>(
          streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
@@ -444,16 +422,16 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  luts_message_carry->release(streams, gpu_indexes, gpu_count);
  delete (luts_message_carry);

-  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                       &old_blocks[num_blocks * big_lwe_size],
-                       big_lwe_dimension, num_blocks);
+  host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
+                num_blocks);
 }

 template <typename Torus, class params>
 __host__ void host_integer_mult_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
-    uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
+    uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  auto glwe_dimension = mem_ptr->params.glwe_dimension;
@@ -570,10 +548,9 @@ __host__ void host_integer_mult_radix_kb(

 template <typename Torus>
 __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {
  *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
                                       num_radix_blocks, allocate_gpu_memory);
 }
--- a/Show More
+++ b/Show More