refactor(gpu): mono-kernel TBC

chore(ci): update node to 22.6
chore(ci): reduce bench loops for WASM compressed server key
2026-04-28 03:01:21 -04:00 · 2024-08-16 15:45:09 +00:00 · 2024-08-14 13:42:14 +02:00 · 2024-08-14 13:42:14 +02:00 · 2024-08-14 13:42:14 +02:00 · 2024-08-14 13:42:14 +02:00
445 changed files with 18558 additions and 8903 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -4,6 +4,7 @@ self-hosted-runner:
    - m1mac
    - 4090-desktop
    - large_windows_16_latest
+    - large_ubuntu_16
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -0,0 +1,120 @@
+# Run backward compatibility tests
+name: Backward compatibility Tests on CPU
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+
+jobs:
+  setup-instance:  # Starts the AWS instance that the backward-compat-tests job runs on
+    name: Setup instance (backward-compat-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # consumed by `runs-on` (tests) and `label` (teardown)
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small  # NOTE(review): presumably an instance profile defined on the Slab side - confirm
+
+  backward-compat-tests:  # Runs `make test_backward_compatibility_ci` against the LFS test-data repository
+    name: Backward compatibility tests
+    needs: [ setup-instance ]
+    concurrency:  # one run per workflow and ref; a newer run cancels the one in progress
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}  # label output by the "Start instance" step
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
+      - name: Install git-lfs  # needed by the `lfs: 'true'` checkout in "Clone test data"
+        run: |
+          sudo apt update && sudo apt -y install git-lfs
+
+      - name: Use specific data branch  # exports BACKWARD_COMPAT_DATA_BRANCH; presumably read by `make backward_compat_branch` - confirm in the Makefile
+        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}  # only for PRs labeled `data_PR`
+        env:
+          PR_BRANCH: ${{ github.head_ref || github.ref_name }}  # head_ref is only set on PR events; otherwise fall back to ref_name
+        run: |
+          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
+
+      - name: Get backward compat branch
+        id: backward_compat_branch
+        run: |
+          BRANCH="$(make backward_compat_branch)"
+          echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
+
+      - name: Clone test data
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'
+          repository: zama-ai/tfhe-backward-compat-data
+          path: tfhe/tfhe-backward-compat-data
+          lfs: 'true'
+          ref: ${{ steps.backward_compat_branch.outputs.branch }}  # branch name printed by `make backward_compat_branch`
+
+      - name: Run backward compatibility tests
+        run: |
+          make test_backward_compatibility_ci
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # notify only when an earlier step of this job failed
+        continue-on-error: true  # a failed notification must not change the job result
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # Stops the instance started by setup-instance
+    name: Teardown instance (backward-compat-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # run even when the tests fail or are cancelled, unless setup was skipped
+    needs: [ setup-instance, backward-compat-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # the label returned when the instance was started
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # notify only when stopping the instance failed
+        continue-on-error: true  # a failed notification must not change the job result
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -1,4 +1,4 @@
-# Run a small subset of shortint and integer tests to ensure quick feedback.
+# Run a small subset of tests to ensure quick feedback.
 name: Fast AWS Tests on CPU

 env:
@@ -11,6 +11,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -18,15 +19,112 @@ on:
  pull_request:

 jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.core_crypto_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      boolean_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.boolean_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      integer_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.integer_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      wasm_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.wasm_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.high_level_api_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.user_docs_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # fetch the full history rather than a shallow clone
+          persist-credentials: 'false'  # do not keep the token in the local git config, as in the other workflows
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            dependencies:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+            csprng:
+              - concrete-csprng/**
+            zk_pok:
+              - tfhe-zk-pok/**
+            core_crypto:
+              - tfhe/src/core_crypto/**
+            boolean:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/boolean/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+            integer:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+            wasm:
+              - tfhe/src/**
+              - tfhe/js_on_wasm_tests/**
+              - tfhe/web_wasm_parallel_tests/**
+              - '!tfhe/src/c_api/**'
+              - '!tfhe/src/boolean/**'
+            high_level_api:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - '!tfhe/src/boolean/**'
+              - '!tfhe/src/js_on_wasm_api/**'
+            user_docs:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - 'tfhe/docs/**.md'
+              - README.md
+
+      - name: Aggregate file changes
+        id: aggregated-changes
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.boolean_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' ||
+          steps.changed-files.outputs.integer_any_changed == 'true' ||
+          steps.changed-files.outputs.wasm_any_changed == 'true' ||
+          steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
+          steps.changed-files.outputs.user_docs_any_changed == 'true')
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
+
  setup-instance:
    name: Setup instance (fast-tests)
+    if: github.event_name != 'pull_request' ||
+      needs.should-run.outputs.any_file_changed == 'true'
+    needs: should-run
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -37,7 +135,9 @@ jobs:

  fast-tests:
    name: Fast CPU tests
-    needs: setup-instance
+    if: github.event_name != 'pull_request' ||
+      needs.setup-instance.result != 'skipped'
+    needs: [ should-run, setup-instance ]
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
@@ -53,59 +153,58 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

-      - name: Install git-lfs
-        run: |
-          sudo apt update && sudo apt -y install git-lfs
-
      - name: Run concrete-csprng tests
+        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
          make test_concrete_csprng

      - name: Run tfhe-zk-pok tests
+        if: needs.should-run.outputs.zk_pok_test == 'true'
        run: |
          make test_zk_pok

      - name: Run core tests
+        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
          AVX512_SUPPORT=ON make test_core_crypto

      - name: Run boolean tests
+        if: needs.should-run.outputs.boolean_test == 'true'
        run: |
          make test_boolean

      - name: Run user docs tests
+        if: needs.should-run.outputs.user_docs_test == 'true'
        run: |
          make test_user_doc

      - name: Run js on wasm API tests
+        if: needs.should-run.outputs.wasm_test == 'true'
        run: |
          make test_nodejs_wasm_api_in_docker

      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true' ||
+          needs.should-run.outputs.integer_test == 'true'
        run: |
          make gen_key_cache

      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_shortint_ci

      - name: Run integer tests
+        if: needs.should-run.outputs.integer_test == 'true'
        run: |
          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_integer_ci

-      - name: Run shortint multi-bit tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_shortint_multi_bit_ci
-
-      - name: Run integer multi-bit tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_integer_multi_bit_ci
-
      - name: Run high-level API tests
+        if: needs.should-run.outputs.high_level_api_test == 'true'
        run: |
          make test_high_level_api

@@ -113,17 +212,6 @@ jobs:
        run: |
          make test_safe_deserialization

-      - name: Clone test data
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/tfhe-backward-compat-data
-          path: tfhe/tfhe-backward-compat-data
-          lfs: 'true'
-
-      - name: Run backward compatibility tests
-        run: |
-          make test_backward_compatibility_ci
-
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
@@ -140,7 +228,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -19,25 +19,55 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  push:
    branches:
      - main
-  schedule:
-    # Nightly tests @ 3AM after each work day
-    - cron: "0 3 * * MON-FRI"

 jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.integer_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            integer:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+
  setup-instance:
    name: Setup instance (unsigned-integer-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+    needs: should-run
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,21 +80,21 @@ jobs:
    name: Unsigned integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
+      group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -100,12 +130,12 @@ jobs:
  teardown-instance:
    name: Teardown instance (unsigned-integer-tests)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, unsigned-integer-tests ]
+    needs: [setup-instance, unsigned-integer-tests]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -19,25 +19,55 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  push:
    branches:
      - main
-  schedule:
-    # Nightly tests @ 3AM after each work day
-    - cron: "0 3 * * MON-FRI"

 jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.integer_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            integer:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+
  setup-instance:
-    name: Setup instance (signed-integer-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+    name: Setup instance (unsigned-integer-tests)
+    needs: should-run
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,21 +80,21 @@ jobs:
    name: Signed integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
+      group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -104,12 +134,12 @@ jobs:
  teardown-instance:
    name: Teardown instance (signed-integer-tests)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, signed-integer-tests ]
+    needs: [setup-instance, signed-integer-tests]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -63,7 +63,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -86,6 +86,8 @@ jobs:
            high_level_api:
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
+              - '!tfhe/src/boolean/**'
+              - '!tfhe/src/js_on_wasm_api/**'
            c_api:
              - tfhe/src/**
            examples:
@@ -121,7 +123,7 @@ jobs:
  setup-instance:
    name: Setup instance (cpu-tests)
    if: github.event_name != 'pull_request' ||
-      (github.event_name == 'pull_request' && needs.should-run.outputs.any_file_changed == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
    needs: should-run
    runs-on: ubuntu-latest
    outputs:
@@ -129,7 +131,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -158,7 +160,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -235,7 +237,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -54,7 +54,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -90,7 +90,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -3,30 +3,9 @@ name: Boolean benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 1 a.m. (UTC).
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
@@ -34,36 +13,60 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-boolean-benchmarks:
-    name: Execute boolean benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:
+    name: Setup instance (boolean-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      github.repository == 'zama-ai/tfhe-rs'
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench

+  boolean-benchmarks:
+    name: Execute boolean benchmarks in EC2
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    continue-on-error: true
+    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -73,14 +76,12 @@ jobs:

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
@@ -97,7 +98,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}
@@ -129,8 +130,28 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (boolean-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, boolean-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (boolean-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -19,14 +19,21 @@ jobs:

    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest-large, large_windows_16_latest]
+        # GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
+        # even with a few PRs
+        os: [large_ubuntu_16, macos-latest, windows-latest]
      fail-fast: false

    steps:
      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
      - name: Install and run newline linter checks
-        if: matrix.os == 'ubuntu-latest'
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
          echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -36,27 +43,33 @@ jobs:
          make check_newline

      - name: Run pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make pcc

      - name: Build concrete-csprng
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_concrete_csprng

      - name: Build Release core
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_core AVX512_SUPPORT=ON
          make build_core_experimental AVX512_SUPPORT=ON

      - name: Build Release boolean
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_boolean

      - name: Build Release shortint
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_shortint

      - name: Build Release integer
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_integer

@@ -65,10 +78,12 @@ jobs:
          make build_tfhe_full

      - name: Build Release c_api
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_c_api

      - name: Build coverage tests
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_tfhe_coverage

--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -10,7 +10,7 @@ jobs:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
        with:
-          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\:) .+$'
+          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
          flags: "gs"
          error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
          excludeDescription: "true" # optional: this excludes the description body of a pull request
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,13 +51,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          files_yaml: |
            tfhe:
@@ -125,7 +125,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -3,30 +3,6 @@ name: Core crypto benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"

 env:
  CARGO_TERM_COLOR: always
@@ -34,36 +10,59 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-core-crypto-benchmarks:
-    name: Execute core crypto benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:
+    name: Setup instance (core-crypto-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench

+  core-crypto-benchmarks:
+    name: Execute core crypto benchmarks in EC2
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -75,21 +74,19 @@ jobs:

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --name-suffix avx512 \
          --walk-subdirs \
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -121,8 +118,28 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (core-crypto-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # run even after failures, unless setup-instance itself was skipped
+    needs: [ setup-instance, core-crypto-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # same label the benchmark job used in runs-on
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only notify when an earlier step of this job failed
+        continue-on-error: true  # a failed notification must not fail the teardown job
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
@@ -83,7 +83,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -128,7 +128,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -175,7 +175,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -54,7 +54,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -78,7 +78,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -0,0 +1,123 @@
+name: Close or Merge corresponding PR on the data repo
+
+# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
+
+env:
+  TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+  CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
+
+# only trigger on pull request closed events
+on:
+  pull_request:
+    types: [ closed ]
+
+# The same pattern is used for jobs that use the github api:
+# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
+# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
+# - "set +e" will make sure we reach the last "echo EOF" even in case of error
+# - "set -o" pipefail makes one line piped command return the error of the first failure
+# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
+# the script will always return 0 because of the "echo EOF".
+
+
+jobs:
+  auto_close_job:
+    if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
+    runs-on: ubuntu-latest
+    steps:
+    - name: Find corresponding Pull Request in the data repo
+      run: |
+        {
+          set +e  # keep going on errors so the closing EOF below is always written
+          set -o pipefail  # let a curl or jq failure surface as the pipeline status
+          echo 'TARGET_REPO_PR<<EOF'  # PR_BRANCH below is expanded by the shell, not templated: branch names are untrusted input
+          curl --fail-with-body --no-progress-meter -L -X GET \
+          -H "Accept: application/vnd.github+json" \
+          -H "X-GitHub-Api-Version: 2022-11-28"  \
+          "${{ env.TARGET_REPO_API_URL }}/pulls?head=${{ github.repository_owner }}:${PR_BRANCH}" | jq -e '.[0]' | sed 's/^null$/{ "message": "corresponding PR not found" }/'
+          RES="$?"  # non-zero when curl failed or jq -e found no matching PR
+          echo EOF
+        } >> "${GITHUB_ENV}"
+        exit $RES
+
+    - name: Comment on the PR to indicate the reason of the close
+      run: |
+        {
+          set +e
+          set -o pipefail
+          echo 'GH_API_RES<<EOF'
+          curl --fail-with-body --no-progress-meter -L -X POST \
+          -H "Accept: application/vnd.github+json" \
+          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
+          -H "X-GitHub-Api-Version: 2022-11-28" \
+          ${{ fromJson(env.TARGET_REPO_PR).comments_url }} \
+          -d '{ "body": "PR ${{ env.CLOSE_TYPE }}d because the corresponding PR in main repo was ${{ env.CLOSE_TYPE }}d: ${{ github.repository }}#${{ github.event.number  }}" }'
+          RES="$?"
+          echo EOF
+        } >> "${GITHUB_ENV}"
+        exit $RES
+
+    - name: Merge the Pull Request in the data repo
+      if: ${{ github.event.pull_request.merged }}
+      run: |
+        {
+          set +e
+          set -o pipefail
+          echo 'GH_API_RES<<EOF'
+          curl --fail-with-body --no-progress-meter -L -X PUT \
+          -H "Accept: application/vnd.github+json" \
+          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
+          -H "X-GitHub-Api-Version: 2022-11-28" \
+          ${{ fromJson(env.TARGET_REPO_PR).url }}/merge \
+          -d '{ "merge_method": "rebase" }'
+          RES="$?"
+          echo EOF
+        } >> "${GITHUB_ENV}"
+        exit $RES
+
+    - name: Close the Pull Request in the data repo
+      if: ${{ !github.event.pull_request.merged }}
+      run: |
+        {
+          set +e
+          set -o pipefail
+          echo 'GH_API_RES<<EOF'
+          curl --fail-with-body --no-progress-meter -L -X PATCH \
+          -H "Accept: application/vnd.github+json" \
+          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
+          -H "X-GitHub-Api-Version: 2022-11-28" \
+          ${{ fromJson(env.TARGET_REPO_PR).url }} \
+          -d '{ "state": "closed" }'
+          RES="$?"
+          echo EOF
+        } >> "${GITHUB_ENV}"
+        exit $RES
+
+    - name: Delete the associated branch in the data repo
+      run: |
+        {
+          set +e  # keep going on errors so the closing EOF below is always written
+          set -o pipefail
+          echo 'GH_API_RES<<EOF'  # PR_BRANCH below is expanded by the shell, not templated: branch names are untrusted input
+          curl --fail-with-body --no-progress-meter -L -X DELETE \
+          -H "Accept: application/vnd.github+json" \
+          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
+          -H "X-GitHub-Api-Version: 2022-11-28" \
+          "${{ env.TARGET_REPO_API_URL }}/git/refs/heads/${PR_BRANCH}"
+          RES="$?"  # curl exit status, returned after the EOF marker is written
+          echo EOF
+        } >> "${GITHUB_ENV}"
+        exit $RES
+
+    - name: Slack Notification
+      if: ${{ always() && job.status == 'failure' }}
+      continue-on-error: true
+      uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+      env:
+        SLACK_COLOR: ${{ job.status }}
+        SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
--- a/.github/workflows/gpu_4090_full_benchmark.yml
+++ b/.github/workflows/gpu_4090_full_benchmark.yml
@@ -1,5 +1,5 @@
-# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
-name: TFHE Cuda Backend - 4090 full benchmarks
+# Run benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
+name: TFHE Cuda Backend - 4090 benchmarks

 env:
  CARGO_TERM_COLOR: always
@@ -11,6 +11,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  FAST_BENCH: TRUE

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -23,8 +24,10 @@ on:

 jobs:
  cuda-integer-benchmarks:
-    name: Cuda integer benchmarks for all operations flavor  (RTX 4090)
-    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
+    name: Cuda integer benchmarks (RTX 4090)
+    if: ${{ github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs' ||
+      contains(github.event.label.name, '4090_bench') }}
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -33,9 +36,6 @@ jobs:
    strategy:
      fail-fast: false
      max-parallel: 1
-      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]

    steps:
      - name: Checkout tfhe-rs
@@ -50,9 +50,10 @@ jobs:
            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+          echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -65,7 +66,7 @@ jobs:

      - name: Run integer benchmarks
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+          make BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu

      - name: Parse results
        run: |
@@ -81,9 +82,9 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -133,7 +134,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -144,7 +145,7 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Run integer benchmarks
+      - name: Run core crypto benchmarks
        run: |
          make bench_pbs_gpu
          make bench_ks_gpu
@@ -163,7 +164,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
@@ -39,7 +39,7 @@ jobs:
          persist-credentials: 'false'

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -0,0 +1,199 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Fast tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+      types: [ labeled ]
+
+jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Check for file changes
+        id: changed-files  # its gpu_any_changed output feeds this job's gpu_test output
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - Makefile
+              - '.github/workflows/gpu_fast_h100_tests.yml'
+              - scripts/**
+              - ci/**
+
+  setup-instance:
+    name: Setup instance (cuda-h100-tests)
+    needs: should-run  # provides the gpu_test output consulted by the condition below
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # consumed by the test job's runs-on and by teardown-instance
+    steps:
+      - name: Start instance
+        id: start-instance  # referenced by the runner-name output above
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:
+    name: CUDA H100 tests
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"  # quoted so it stays a string; used to build CUDA_PATH
+            gcc: 11  # selects the gcc-11 / g++-11 host compilers exported later in the job
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap  # CMake is built from the release tarball downloaded above
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run core crypto and internal CUDA backend tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
+
+      - name: Run user docs tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # run even after failures, unless setup-instance itself was skipped
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # same label cuda-tests-linux used in runs-on
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only notify when an earlier step of this job failed
+        continue-on-error: true  # a failed notification must not fail the teardown job
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -11,6 +11,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -18,26 +19,64 @@ on:
  pull_request:

 jobs:
+  should-run:  # Decides whether the GPU jobs need to run for this event.
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:  # gpu_test is 'true' outside pull requests, or when a file of the gpu group changed.
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # Fetch the full history instead of a shallow clone.
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - '.github/workflows/gpu_fast_tests.yml'
+              - Makefile
+              - scripts/**
+              - ci/**
+
  setup-instance:
    name: Setup instance (cuda-tests)
+    needs: should-run  # Required to read the gpu_test output in the condition below.
+    if: github.event_name != 'pull_request' ||
+      needs.should-run.outputs.gpu_test == 'true'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack  # The instance is now requested from hyperstack instead of aws.
          profile: gpu-test

  cuda-tests-linux:
    name: CUDA tests
-    needs: setup-instance
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -49,11 +88,23 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11 
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
@@ -64,7 +115,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -87,6 +138,10 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
@@ -104,13 +159,18 @@ jobs:
        run: |
          make test_high_level_api_gpu

-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
+  slack-notify:  # Dedicated job reporting the result of cuda-tests-linux to Slack.
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}  # Evaluated even after a failure; no message when the tests were skipped.
+    continue-on-error: true  # A failed notification must not fail the workflow run.
+    steps:
+      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}  # Uses the test job result, not the status of this notification job.
+          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -120,7 +180,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -11,33 +11,74 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+    types: [ labeled ]

 jobs:
+  should-run:  # Decides whether the multi-GPU jobs need to run for this event.
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:  # gpu_test is 'true' outside pull requests, or when a file of the gpu group changed.
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # Fetch the full history instead of a shallow clone.
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - Makefile
+              - '.github/workflows/**_multi_gpu_tests.yml'
+              - scripts/**
+              - ci/**
+
  setup-instance:
    name: Setup instance (cuda-tests-multi-gpu)
+    needs: should-run  # Required to read the gpu_test output in the condition below.
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack  # The instance is now requested from hyperstack instead of aws.
          profile: multi-gpu-test

  cuda-tests-linux:
    name: CUDA multi-GPU tests
-    needs: [ setup-instance ]
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -49,20 +90,34 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11 
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -85,29 +140,39 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

-      - name: Run core crypto, integer and internal CUDA backend tests
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
+      - name: Run multi-bit CUDA integer tests
        run: |
-          make test_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci

      - name: Run user docs tests
        run: |
-          make test_user_doc_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu

      - name: Test C API
        run: |
-          make test_c_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu

-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
+  slack-notify:  # Dedicated job reporting the result of cuda-tests-linux to Slack.
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}  # Evaluated even after a failure; no message when the tests were skipped.
+    continue-on-error: true  # A failed notification must not fail the workflow run.
+    steps:
+      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}  # Uses the test job result, not the status of this notification job.
+          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests-multi-gpu)
@@ -117,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_gpu_pcc.yml
+++ b/.github/workflows/aws_tfhe_gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -109,7 +109,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -1,5 +1,5 @@
-# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: TFHE Cuda Backend - Full tests on H100
+# Signed integer GPU tests on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Signed integer tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -11,22 +11,61 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+      types: [ labeled ]

 jobs:
+  should-run:  # Decides whether the signed integer H100 jobs need to run for this event.
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:  # gpu_test is 'true' outside pull requests, or when a file of the gpu group changed.
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # Fetch the full history instead of a shallow clone.
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - Makefile
+              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
+              - scripts/**
+              - ci/**
+
  setup-instance:
    name: Setup instance (cuda-h100-tests)
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -36,8 +75,10 @@ jobs:
          profile: single-h100

  cuda-tests-linux:
-    name: CUDA H100 tests
-    needs: [ setup-instance ]
+    name: CUDA H100 signed integer tests
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -52,22 +93,13 @@ jobs:
            gcc: 11 
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-           $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -83,7 +115,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -106,31 +138,23 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

-      - name:
+      - name: Check device is detected
        if: ${{ !cancelled() }}
        run: nvidia-smi

-      - name: Run core crypto, integer and internal CUDA backend tests
+      - name: Run signed integer tests
        run: |
-          make test_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci

-      - name: Run user docs tests
+      - name: Run signed integer multi-bit tests
        run: |
-          make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          make test_high_level_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    if: ${{ !success() && !cancelled() }}
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -147,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml
@@ -11,33 +11,82 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  FAST_TESTS: TRUE
+  NIGHTLY_TESTS: FALSE
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+    types:
+      - opened
+      - synchronize
+      - labeled
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 jobs:
+  should-run:  # Decides whether the signed integer GPU jobs need to run for this event.
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:  # gpu_test is 'true' outside pull requests, or when a file of the gpu group changed.
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # Fetch the full history instead of a shallow clone.
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - '.github/workflows/gpu_signed_integer_tests.yml'
+              - Makefile
+              - scripts/**
+              - ci/**
+
  setup-instance:
    name: Setup instance (cuda-signed-integer-tests)
    runs-on: ubuntu-latest
+    needs: should-run  # Required to read the gpu_test output in the condition below.
+    if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack  # The instance is now requested from hyperstack instead of aws.
          profile: gpu-test

  cuda-signed-integer-tests:
    name: CUDA signed integer tests
-    needs: setup-instance
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -49,20 +98,34 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11 
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev 
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -85,21 +148,34 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

-      - name: Run signed integer tests
+      - name: Should run nightly tests
+        if: github.event_name == 'schedule'
        run: |
-          make test_signed_integer_gpu_ci
+          {
+            echo "FAST_TESTS=FALSE";
+            echo "NIGHTLY_TESTS=TRUE";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi

      - name: Run signed integer multi-bit tests
        run: |
          make test_signed_integer_multi_bit_gpu_ci

-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
+  slack-notify:  # Dedicated job reporting the result of cuda-signed-integer-tests to Slack.
+    name: Slack Notification
+    needs: [ setup-instance, cuda-signed-integer-tests ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}  # Evaluated even after a failure; no message when the tests were skipped.
+    continue-on-error: true  # A failed notification must not fail the workflow run.
+    steps:
+      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS signed integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}  # Uses the test job result, not the status of this notification job.
+          SLACK_MESSAGE: "Signed integer GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -109,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -0,0 +1,188 @@
+# Test unsigned integers on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Unsigned integer tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8 MiB
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}  # Read by the should-run job to compute its gpu_test output.
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]  # Only the 'labeled' pull request activity triggers this workflow.
+
+jobs:
+  should-run:  # Decides whether the unsigned integer H100 jobs need to run for this event.
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:  # gpu_test is 'true' outside pull requests, or when a file of the gpu group changed.
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # Fetch the full history instead of a shallow clone.
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - Makefile
+              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
+              - scripts/**
+              - ci/**
+
+  setup-instance:  # Starts the instance that cuda-tests-linux runs on.
+    name: Setup instance (cuda-h100-tests)
+    needs: should-run  # Required to read the gpu_test output in the condition below.
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # Label consumed by the test and teardown jobs.
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:  # Builds the toolchain prerequisites, then runs the unsigned integer GPU test targets.
+    name: CUDA H100 unsigned integer tests
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # Runs on main are never cancelled by a newer run.
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}  # Runs on the instance started by setup-instance.
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6  # CMake release downloaded and built from source in the first step.
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run unsigned integer tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
+
+      - name: Run unsigned integer multi-bit tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
+
+  slack-notify:  # Reports the result of cuda-tests-linux to Slack.
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}  # Evaluated even after a failure; no message when the tests were skipped.
+    continue-on-error: true  # A failed notification must not fail the workflow run.
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}  # The test job result is passed as the message color.
+          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # Stops the instance whose label was output by setup-instance.
+    name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # Runs even if earlier jobs failed, unless setup-instance was skipped.
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # Only notify when a previous step of this job failed.
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -11,33 +11,81 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  FAST_TESTS: TRUE
+  NIGHTLY_TESTS: FALSE

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+    types:
+      - opened
+      - synchronize
+      - labeled
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 jobs:
+  should-run:  # Computes the gpu_test output consumed by setup-instance.
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}  # NOTE(review): IS_PULL_REQUEST is not set in the env lines visible in this hunk - confirm it is defined.
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # Full history; presumably needed by the change detection below.
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/*.md'
+              - '.github/workflows/gpu_unsigned_integer_tests.yml'
+              - Makefile
+              - scripts/**
+              - ci/**
+
  setup-instance:
    name: Setup instance (cuda-unsigned-integer-tests)
+    needs: should-run  # Provides gpu_test; the condition below starts an instance on schedule (main repo only), manual dispatch, or non-'labeled' events with gpu_test == 'true'.
+    if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
+          backend: hyperstack  # Replaces the aws backend used before this change.
          profile: gpu-test

  cuda-unsigned-integer-tests:
    name: CUDA unsigned integer tests
-    needs: setup-instance
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -49,20 +97,32 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet: CMake is built from source on every run.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -85,21 +145,34 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

-      - name: Run unsigned integer tests
+      - name: Should run nightly tests
+        if: github.event_name == 'schedule'
        run: |
-          make test_unsigned_integer_gpu_ci
+          {
+            echo "FAST_TESTS=FALSE";
+            echo "NIGHTLY_TESTS=TRUE";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi

      - name: Run unsigned integer multi-bit tests
        run: |
          make test_unsigned_integer_multi_bit_gpu_ci

-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
+  slack-notify:  # Separate job that posts the result of cuda-unsigned-integer-tests to Slack.
+    name: Slack Notification
+    needs: [ setup-instance, cuda-unsigned-integer-tests ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}  # Runs after failures too, but not when the tests were skipped.
+    continue-on-error: true  # A failed notification must not fail the workflow run.
+    steps:
+      - name: Send message
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS unsigned integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}  # Result of the test job, not the status of this job.
+          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -109,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_benchmark.yml
+++ b/.github/workflows/integer_benchmark.yml
@@ -1,130 +0,0 @@
-# Run integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Integer benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute integer benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
-        with:
-          toolchain: nightly
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE bench_integer
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_full_benchmark.yml
+++ b/.github/workflows/integer_full_benchmark.yml
@@ -1,28 +1,20 @@
 # Run all integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Integer full benchmarks
+name: Integer benchmarks

 on:
  workflow_dispatch:
    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+      all_precisions:
+        description: "Run all precisions"
+        type: boolean
+        default: false
+
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+    # Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
+    # These benchmarks take far longer to execute, hence the reason to run them only four times a year.
+    - cron: '0 4 25 MAR,JUN,SEP,DEC *'

 env:
  CARGO_TERM_COLOR: always
@@ -30,21 +22,29 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  FAST_BENCH: TRUE

 jobs:
  prepare-matrix:
    name: Prepare operations matrix
    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
    steps:
      - name: Weekly benchmarks
-        if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
+        if: github.event_name == 'workflow_dispatch' ||
+          github.event.schedule == '0 1 * * 6'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"

      - name: Quarterly benchmarks
-        if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
+        if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
        run: |
          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"

@@ -53,11 +53,31 @@ jobs:
         run: |
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

+  setup-instance:  # Starts the benchmark runner and exposes its label to the other jobs.
+    name: Setup instance (integer-benchmarks)
+    needs: prepare-matrix  # Not started when prepare-matrix is skipped by its repository guard.
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # Used as runs-on by integer-benchmarks and as label by teardown-instance.
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
  integer-benchmarks:
    name: Execute integer benchmarks for all operations flavor
-    needs: prepare-matrix
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+    needs: [ prepare-matrix, setup-instance ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    concurrency:  # One group per matrix entry: entries sharing a group would cancel or evict each other.
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    continue-on-error: true
    timeout-minutes: 1440  # 24 hours
    strategy:
@@ -66,13 +86,6 @@ jobs:
        command: [ integer, integer_multi_bit]
        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
@@ -92,7 +105,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -103,6 +116,11 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Should run benchmarks with all precisions
+        if: inputs.all_precisions
+        run: |
+          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
+
      - name: Run benchmarks with AVX512
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
@@ -111,7 +129,7 @@ jobs:
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "hpc7a.96xlarge" \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
@@ -121,7 +139,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -140,19 +158,34 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-  slack-notification:
-    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ failure() }}
-    needs: integer-benchmarks
-    steps:
-      - name: Notify
+      - name: Slack Notification
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:  # Stops the runner instance started by setup-instance.
+    name: Teardown instance (integer-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # Runs even if the benchmarks failed, as long as setup was not skipped.
+    needs: [ setup-instance, integer-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # Runner label output by setup-instance.
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # Only notify when a previous step of this job failed.
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -23,14 +23,14 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-integer-benchmarks)
    runs-on: ubuntu-latest
-    if:  github.event_name != 'push' ||
+    if:  github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
@@ -86,7 +86,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -110,6 +110,10 @@ jobs:
            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
          } >> "${GITHUB_ENV}"

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run benchmarks with AVX512
        run: |
          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
@@ -120,7 +124,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -140,7 +144,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -170,7 +174,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -187,7 +191,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -57,7 +57,7 @@ jobs:
            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
@@ -90,7 +90,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -121,6 +121,10 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run benchmarks with AVX512
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
@@ -140,7 +144,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -180,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_bit_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_benchmark.yml
@@ -1,130 +0,0 @@
-# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
-name: Integer Multi-bit benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute integer multi-bit benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
-        with:
-          toolchain: nightly
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE bench_integer_multi_bit
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -4,10 +4,14 @@ name: Integer GPU Multi-bit benchmarks
 on:
  workflow_dispatch:
    inputs:
-      full_benchmark:
+      all_precisions:
        description: "Run all precisions"
        type: boolean
        default: false
+      fast_default:
+        description: "Run only deduplicated default operations without scalar variants"
+        type: boolean
+        default: false

  schedule:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
@@ -25,6 +29,7 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE
+  BENCH_OP_FLAVOR: default

 jobs:
  setup-instance:
@@ -37,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -61,7 +66,7 @@ jobs:
            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
@@ -94,7 +99,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -118,14 +123,23 @@ jobs:
            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
          } >> "${GITHUB_ENV}"

-      - name: Should run full benchmarks
-        if: inputs.full_benchmark
+      - name: Should run benchmarks with all precisions
+        if: inputs.all_precisions
        run: |
          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"

+      - name: Should run fast subset benchmarks
+        if: inputs.fast_default
+        run: |
+          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make FAST_BENCH=${{ env.FAST_BENCH }} BENCH_OP_FLAVOR=default bench_unsigned_integer_multi_bit_gpu
+          make bench_unsigned_integer_multi_bit_gpu

      - name: Parse benchmarks to csv
        run: |
@@ -133,7 +147,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -153,7 +167,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -201,7 +215,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -4,10 +4,14 @@ name: Integer multi GPU Multi-bit benchmarks
 on:
  workflow_dispatch:
    inputs:
-      full_benchmark:
+      all_precisions:
        description: "Run all precisions"
        type: boolean
        default: false
+      fast_default:
+        description: "Run only deduplicated default operations without scalar variants"
+        type: boolean
+        default: false

  schedule:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
@@ -24,25 +28,28 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE
+  BENCH_OP_FLAVOR: default

 jobs:
  setup-instance:
    name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch' }}
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: multi-gpu-test
+          backend: hyperstack
+          profile: multi-h100

  cuda-integer-multi-bit-multi-gpu-benchmarks:
    name: Execute multi GPU integer multi-bit benchmarks
@@ -57,11 +64,23 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
@@ -81,7 +100,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -112,20 +131,29 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Should run full benchmarks
-        if: inputs.full_benchmark
+      - name: Should run benchmarks with all precisions
+        if: inputs.all_precisions
        run: |
          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"

+      - name: Should run fast subset benchmarks
+        if: inputs.fast_default
+        run: |
+          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make FAST_BENCH=${{ env.FAST_BENCH }} BENCH_OP_FLAVOR=default bench_unsigned_integer_multi_bit_gpu
+          make bench_unsigned_integer_multi_bit_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "p3.8xlarge" \
+          --hardware "n3-H100x8" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -136,7 +164,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -176,7 +204,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -29,14 +29,14 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: multi-gpu-test
+          backend: hyperstack
+          profile: multi-h100

  cuda-integer-full-multi-gpu-benchmarks:
    name: Execute multi GPU integer benchmarks for all operations flavor
@@ -54,11 +54,23 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
@@ -78,7 +90,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -109,6 +121,10 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run benchmarks with AVX512
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
@@ -117,7 +133,7 @@ jobs:
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "p3.8xlarge" \
+          --hardware "n3-H100x8" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -128,7 +144,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -168,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -39,7 +39,7 @@ jobs:
          persist-credentials: 'false'

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -30,20 +30,62 @@ env:
  NPM_TAG: ""

 jobs:
-  publish_release:
-    name: Publish Release
+  package:
    runs-on: ubuntu-latest
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+        with:
+          name: crate
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
+  publish_release:
+    name: Publish Release
+    needs: [package] # for comparing hashes
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
      - name: Create NPM version tag
        if: ${{ inputs.npm_latest_tag }}
        run: |
          echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
-
+      - name: Download artifact
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        with:
+          name: crate
+          path: target/package
      - name: Publish crate.io package
        if: ${{ inputs.push_to_crates }}
        env:
@@ -52,6 +94,22 @@ jobs:
        run: |
          cargo publish -p tfhe --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}

+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: failure
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "SLSA tfhe crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
      - name: Build web package
        if: ${{ inputs.push_web_package }}
        run: |
@@ -65,6 +123,7 @@ jobs:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
          tag: ${{ env.NPM_TAG }}
+          provenance: true

      - name: Build Node package
        if: ${{ inputs.push_node_package }}
@@ -82,6 +141,7 @@ jobs:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
          tag: ${{ env.NPM_TAG }}
+          provenance: true

      - name: Slack Notification
        if: ${{ failure() }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -1,4 +1,3 @@
-# Publish new release of tfhe-rs on various platform.
 name: Publish concrete-csprng release

 on:
@@ -37,6 +36,6 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_concrete_tfhe_versionable.yml
+++ b/.github/workflows/make_release_concrete_tfhe_versionable.yml
@@ -0,0 +1,36 @@
+name: Publish tfhe-versionable release
+
+on:
+  workflow_dispatch:
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  publish_release:
+    name: Publish tfhe-versionable Release
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Publish crate.io package
+        env:  # the token reaches cargo through the shell environment, not by expression substitution into the script text
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+        run: |
+          cargo publish -p tfhe-versionable-derive --token "${CRATES_TOKEN}"
+          cargo publish -p tfhe-versionable --token "${CRATES_TOKEN}"
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: stable

@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -14,7 +14,7 @@ on:

 jobs:
  params-curves-security-check:
-    runs-on: ubuntu-latest
+    runs-on: large_ubuntu_16
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
@@ -24,7 +24,7 @@ jobs:
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
-          ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
+          ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'

      - name: Install Sage
        run: |
--- a/.github/workflows/shortint_benchmark.yml
+++ b/.github/workflows/shortint_benchmark.yml
@@ -1,128 +0,0 @@
-# Run shortint benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Shortint benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-jobs:
-  run-shortint-benchmarks:
-    name: Execute shortint benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
-        with:
-          toolchain: nightly
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make bench_shortint
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Measure key sizes
-        run: |
-          make measure_shortint_key_sizes
-
-      - name: Parse key sizes results
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
-          --key-sizes \
-          --append-results
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_shortint
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/shortint_full_benchmark.yml
+++ b/.github/workflows/shortint_full_benchmark.yml
@@ -3,30 +3,13 @@ name: Shortint full benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+    # Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
+    # These benchmarks take far longer to execute, which is why they only run four times a year.
+    - cron: '0 4 25 MAR,JUN,SEP,DEC *'
+

 env:
  CARGO_TERM_COLOR: always
@@ -34,24 +17,67 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
+  prepare-matrix:
+    name: Prepare operations matrix
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+    steps:
+      - name: Weekly benchmarks
+        if: github.event_name == 'workflow_dispatch' ||
+          github.event.schedule == '0 1 * * 6'
+        run: |
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+
+      - name: Quarterly benchmarks
+        if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
+        run: |
+          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\"]" >> "${GITHUB_ENV}"
+
+      - name: Set operation flavor output  # OP_FLAVOR holds a JSON array string written to GITHUB_ENV by an earlier step
+        id: set_op_flavor
+        run: |
+          echo "op_flavor=${OP_FLAVOR}" >> "${GITHUB_OUTPUT}"
+
+  setup-instance:
+    name: Setup instance (shortint-benchmarks)
+    needs: prepare-matrix
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
  shortint-benchmarks:
    name: Execute shortint benchmarks for all operations flavor
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+    needs: [ prepare-matrix, setup-instance ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    continue-on-error: true
    strategy:
      max-parallel: 1
      matrix:
-        op_flavor: [ default, smart, unchecked ]
+        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
@@ -71,7 +97,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -92,7 +118,7 @@ jobs:
          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "hpc7a.96xlarge" \
          --project-version "${COMMIT_HASH}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${COMMIT_DATE}" \
@@ -115,7 +141,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -134,19 +160,34 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-  slack-notification:
-    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ failure() }}
-    needs: shortint-benchmarks
-    steps:
-      - name: Notify
+      - name: Slack Notification
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (shortint-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, shortint-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (shortint-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/signed_integer_benchmark.yml
+++ b/.github/workflows/signed_integer_benchmark.yml
@@ -1,130 +0,0 @@
-# Run signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Signed Integer benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute signed integer benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
-        with:
-          toolchain: nightly
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE bench_signed_integer
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_cpu_benchmark.yml
+++ b/.github/workflows/signed_integer_cpu_benchmark.yml
@@ -0,0 +1,191 @@
+# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: Signed Integer full benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      all_precisions:
+        description: "Run all precisions"
+        type: boolean
+        default: false
+
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+    # Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
+    # These benchmarks take far longer to execute, which is why they only run four times a year.
+    - cron: '0 4 25 MAR,JUN,SEP,DEC *'
+
+env:  # Workflow-wide environment shared by every job below.
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8 MiB
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  FAST_BENCH: "TRUE"  # quoted: must stay the literal string TRUE, not a YAML boolean; set to FALSE when all_precisions is requested
+
+jobs:
+  prepare-matrix:  # Chooses the operation flavors to benchmark and exposes them as the op_flavor job output.
+    name: Prepare operations matrix
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}  # JSON array text, e.g. ["default"]
+    steps:
+      - name: Weekly benchmarks  # also selected for manual (workflow_dispatch) runs
+        if: github.event_name == 'workflow_dispatch' ||
+          github.event.schedule == '0 1 * * 6'
+        run: |
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+
+      - name: Quarterly benchmarks  # the schedule string must match the quarterly cron entry exactly
+        if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
+        run: |
+          echo "OP_FLAVOR=[\"default\", \"unchecked\"]" >> "${GITHUB_ENV}"
+
+      - name: Set operation flavor output
+        id: set_op_flavor  # referenced by the job-level outputs mapping above
+        run: |
+          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
+
+  setup-instance:  # Starts the benchmark instance and exports its runner label.
+    name: Setup instance (signed-integer-benchmarks)
+    needs: prepare-matrix  # not started when prepare-matrix is skipped
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # used as runs-on by the benchmark job
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench  # NOTE(review): presumably a profile defined in the Slab configuration - confirm
+
+  signed-integer-benchmarks:
+    name: Execute signed integer benchmarks for all operations flavor
+    needs: [ prepare-matrix, setup-instance ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    continue-on-error: true
+    timeout-minutes: 1440  # 24 hours
+    strategy:
+      max-parallel: 1  # run matrix entries one at a time
+      matrix:
+        command: [ integer, integer_multi_bit ]
+        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}  # flavors selected by prepare-matrix instead of a fixed list
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Should run benchmarks with all precisions
+        if: inputs.all_precisions
+        run: |
+          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # Stops the instance started by the setup-instance job.
+    name: Teardown instance (signed-integer-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # run even after failures, unless setup was skipped
+    needs: [ setup-instance, signed-integer-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # runner label exported by setup-instance
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only when a previous step of this job failed
+        continue-on-error: true  # a failed notification must not fail the job
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (signed-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/signed_integer_full_benchmark.yml
+++ b/.github/workflows/signed_integer_full_benchmark.yml
@@ -1,136 +0,0 @@
-# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Signed Integer full benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-jobs:
-  integer-benchmarks:
-    name: Execute signed integer benchmarks for all operations flavor
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    continue-on-error: true
-    timeout-minutes: 1440  # 24 hours
-    strategy:
-      max-parallel: 1
-      matrix:
-        command: [ integer, integer_multi_bit ]
-        op_flavor: [ default, unchecked ]
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
-        with:
-          toolchain: nightly
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-  slack-notification:
-    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ failure() }}
-    needs: integer-benchmarks
-    steps:
-      - name: Notify
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_multi_bit_benchmark.yml
+++ b/.github/workflows/signed_integer_multi_bit_benchmark.yml
@@ -1,130 +0,0 @@
-# Run signed integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
-name: Signed Integer Multi-bit benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute signed integer multi-bit benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
-        with:
-          toolchain: nightly
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE bench_signed_integer_multi_bit
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/start_benchmarks.yml
+++ b/.github/workflows/start_benchmarks.yml
@@ -1,123 +0,0 @@
-# Start all benchmark jobs on Slab CI bot.
-name: Start all benchmarks
-
-on:
-  push:
-    branches:
-      - "main"
-  workflow_dispatch:
-    inputs:
-      # The input name must be the name of the slab command to launch
-      boolean_bench:
-        description: "Run Boolean benches"
-        type: boolean
-        default: true
-      shortint_bench:
-        description: "Run shortint benches"
-        type: boolean
-        default: true
-      integer_bench:
-        description: "Run integer benches"
-        type: boolean
-        default: true
-      signed_integer_bench:
-        description: "Run signed integer benches"
-        type: boolean
-        default: true
-      integer_multi_bit_bench:
-        description: "Run integer multi bit benches"
-        type: boolean
-        default: true
-      signed_integer_multi_bit_bench:
-        description: "Run signed integer multi bit benches"
-        type: boolean
-        default: true
-      core_crypto_bench:
-        description: "Run core crypto benches"
-        type: boolean
-        default: true
-
-jobs:
-  start-benchmarks:
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
-    strategy:
-      matrix:
-        command: [ boolean_bench, shortint_bench,
-                   integer_bench, integer_multi_bit_bench,
-                   signed_integer_bench, signed_integer_multi_bit_bench,
-                   core_crypto_bench ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
-        with:
-          files_yaml: |
-            common_benches:
-              - toolchain.txt
-              - Makefile
-              - ci/slab.toml
-              - tfhe/Cargo.toml
-              - tfhe/src/core_crypto/**
-              - .github/workflows/start_benchmarks.yml
-            boolean_bench:
-              - tfhe/src/boolean/**
-              - tfhe/benches/boolean/**
-              - .github/workflows/boolean_benchmark.yml
-            shortint_bench:
-              - tfhe/src/shortint/**
-              - tfhe/benches/shortint/**
-              - .github/workflows/shortint_benchmark.yml
-            integer_bench:
-              - tfhe/src/shortint/**
-              - tfhe/src/integer/**
-              - tfhe/benches/integer/bench.rs
-              - .github/workflows/integer_benchmark.yml
-            integer_multi_bit_bench:
-              - tfhe/src/shortint/**
-              - tfhe/src/integer/**
-              - tfhe/benches/integer/bench.rs
-              - .github/workflows/integer_multi_bit_benchmark.yml
-            signed_integer_bench:
-              - tfhe/src/shortint/**
-              - tfhe/src/integer/**
-              - tfhe/benches/integer/signed_bench.rs
-              - .github/workflows/signed_integer_benchmark.yml
-            signed_integer_multi_bit_bench:
-              - tfhe/src/shortint/**
-              - tfhe/src/integer/**
-              - tfhe/benches/integer/signed_bench.rs
-              - .github/workflows/signed_integer_multi_bit_benchmark.yml
-            core_crypto_bench:
-              - tfhe/src/core_crypto/**
-              - tfhe/benches/core_crypto/**
-              - .github/workflows/core_crypto_benchmark.yml
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Start AWS job in Slab
-        # If manually triggered check that the current bench has been requested
-        # Otherwise if it's on push check that files relevant to benchmarks have changed
-        if: (github.event_name == 'workflow_dispatch' && github.event.inputs[matrix.command] == 'true') || (github.event_name == 'push' && (steps.changed-files.outputs.common_benches_any_changed == 'true' || steps.changed-files.outputs[format('{0}_any_changed', matrix.command)] == 'true'))
-        shell: bash
-        run: |
-          echo -n '{"command": "${{ matrix.command }}", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' > command.json
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh command.json '${{ secrets.JOB_SECRET }}')"
-          curl -v -k \
-          --fail-with-body \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: start_aws" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @command.json \
-          ${{ secrets.SLAB_URL }}
--- a/.github/workflows/start_full_benchmarks.yml
+++ b/.github/workflows/start_full_benchmarks.yml
@@ -1,66 +0,0 @@
-# Start all benchmark jobs, including full shortint and integer, on Slab CI bot.
-name: Start full suite benchmarks
-
-on:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-    # Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
-    # These benchmarks are far longer to execute hence the reason to run them only four time a year.
-    - cron: '0 4 25 MAR,JUN,SEP,DEC *'
-  workflow_dispatch:
-    inputs:
-      benchmark_type:
-        description: 'Benchmark type'
-        required: true
-        default: 'weekly'
-        type: choice
-        options:
-          - weekly
-          - quarterly
-
-jobs:
-  start-benchmarks:
-    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
-    strategy:
-      matrix:
-        command: [ boolean_bench, shortint_full_bench,
-                   integer_full_bench, signed_integer_full_bench,
-                   core_crypto_bench, wasm_client_bench ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          fetch-depth: 0
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Set benchmarks type as weekly
-        if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'weekly') || github.event.schedule == '0 1 * * 6'
-        run: |
-          echo "BENCH_TYPE=weekly_benchmarks" >> "${GITHUB_ENV}"
-
-      - name: Set benchmarks type as quarterly
-        if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'quarterly') || github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
-        run: |
-          echo "BENCH_TYPE=quarterly_benchmarks" >> "${GITHUB_ENV}"
-
-      - name: Start AWS job in Slab
-        shell: bash
-        run: |
-          echo -n '{"command": "${{ matrix.command }}", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "user_inputs": "${{ env.BENCH_TYPE }}"}' > command.json
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh command.json '${{ secrets.JOB_SECRET }}')"
-          curl -v -k \
-          --fail-with-body \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: start_aws" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @command.json \
-          ${{ secrets.SLAB_URL }}
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -25,7 +25,8 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch' ||
-      ((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
      pull-requests: write
    outputs:
@@ -38,7 +39,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -53,7 +54,8 @@ jobs:

  setup-instance:
    name: Setup instance (wasm-client-benchmarks)
-    if: github.event_name != 'push' ||
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
    needs: should-run
    runs-on: ubuntu-latest
@@ -62,7 +64,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -73,9 +75,8 @@ jobs:

  wasm-client-benchmarks:
    name: Execute WASM client benchmarks
-    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'push' ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
+    needs: setup-instance
+    if: needs.setup-instance.result != 'skipped'
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
@@ -97,7 +98,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -129,7 +130,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}
@@ -171,7 +172,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/zk_pke_benchmark.yml
+++ b/.github/workflows/zk_pke_benchmark.yml
@@ -24,8 +24,8 @@ env:
 jobs:
  should-run:
    runs-on: ubuntu-latest
-    if: github.event_name != 'push' ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
+    if: github.event_name == 'workflow_dispatch' ||
+      ((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
    steps:
@@ -36,7 +36,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
+        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -55,7 +55,7 @@ jobs:
    name: Setup instance (pke-zk-benchmarks)
    runs-on: ubuntu-latest
    needs: should-run
-    if: github.event_name != 'push' ||
+    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' &&
      github.repository == 'zama-ai/tfhe-rs' &&
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -76,12 +76,10 @@ jobs:

  pke-zk-benchmarks:
    name: Execute PKE ZK benchmarks
-    if: github.event_name != 'push' ||
-      ((github.event_name == 'push' || github.event_name == 'schedule') &&
-      needs.setup-instance.result != 'skipped')
-    needs: [ should-run, setup-instance ]
+    if: needs.setup-instance.result != 'skipped'
+    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -104,7 +102,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
        with:
          toolchain: nightly

@@ -140,7 +138,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
        with:
          name: ${{ github.sha }}_integer_zk
          path: ${{ env.RESULTS_FILENAME }}
@@ -182,7 +180,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ target/
 # In case of symlinked keys
 /keys

+**/*.rmeta
 **/Cargo.lock
 **/*.bin

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,10 +8,13 @@ members = [
    "concrete-csprng",
    "backends/tfhe-cuda-backend",
    "utils/tfhe-versionable",
-    "utils/tfhe-versionable-derive"
+    "utils/tfhe-versionable-derive",
 ]
+
 exclude = [
-    "tfhe/backward_compatibility_tests"
+    "tfhe/backward_compatibility_tests",
+    "utils/cargo-tfhe-lints-inner",
+    "utils/cargo-tfhe-lints"
 ]

 [profile.bench]
--- a/Makefile
+++ b/Makefile
@@ -16,21 +16,15 @@ GEN_KEY_CACHE_COVERAGE_ONLY?=FALSE
 PARSE_INTEGER_BENCH_CSV_FILE?=tfhe_rs_integer_benches.csv
 FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
+NIGHTLY_TESTS?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
-NODE_VERSION=20
+NODE_VERSION=22.6
 FORWARD_COMPAT?=OFF
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_DIR=tfhe-backward-compat-data
-# sed: -n, do not print input stream, -e means a script/expression
-# 1,/version/ indicates from the first line, to the line matching version at the start of the line
-# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
-# entry which should be the version of tfhe
-TFHE_CURRENT_VERSION:=\
-$(shell sed -n -e '1,/^version/p' tfhe/Cargo.toml | \
-grep '^version[[:space:]]*=' | cut -d '=' -f 2 | xargs)
-# Cargo has a hard time distinguishing between our package from the workspace and a package that
-# could be a dependency, so we build an unambiguous spec here
-TFHE_SPEC:=tfhe@$(TFHE_CURRENT_VERSION)
+BACKWARD_COMPAT_DATA_BRANCH?=v0.1
+BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
+BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
+TFHE_SPEC:=tfhe
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
 # copy paste the command in the terminal and change them if required without forgetting the flags
 export RUSTFLAGS?=-C target-cpu=native
@@ -117,7 +111,7 @@ install_cargo_nextest: install_rs_build_toolchain
 .PHONY: install_wasm_pack # Install wasm-pack to build JS packages
 install_wasm_pack: install_rs_build_toolchain
 	@wasm-pack --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install wasm-pack || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@0.13.0 || \
 	( echo "Unable to install cargo wasm-pack, unknown error." && exit 1 )

 .PHONY: install_node # Install last version of NodeJS via nvm
@@ -147,6 +141,11 @@ install_tarpaulin: install_rs_build_toolchain
 	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-tarpaulin --locked || \
 	( echo "Unable to install cargo tarpaulin, unknown error." && exit 1 )

+.PHONY: install_tfhe_lints # Install custom tfhe-rs lints
+install_tfhe_lints:
+	(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
+	cd utils/cargo-tfhe-lints && cargo install --path .
+
 .PHONY: check_linelint_installed # Check if linelint newline linter is installed
 check_linelint_installed:
 	@printf "\n" | linelint - > /dev/null 2>&1 || \
@@ -266,6 +265,17 @@ clippy: install_rs_check_toolchain
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

+.PHONY: clippy_rustdoc # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
+clippy_rustdoc: install_rs_check_toolchain
+	if [[ "$(OS)" != "Linux" && "$(OS)" != "Darwin" ]]; then \
+		echo "WARNING: skipped clippy_rustdoc, unsupported OS $(OS)"; \
+		exit 0; \
+	fi && \
+	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
+		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats \
+		-p $(TFHE_SPEC)
+
 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
 clippy_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -306,18 +316,23 @@ clippy_zk_pok: install_rs_check_toolchain
 		-p tfhe-zk-pok -- --no-deps -D warnings

 .PHONY: clippy_all # Run all clippy targets
-clippy_all: clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets clippy_c_api \
-clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
+clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium

 .PHONY: clippy_fast # Run main clippy targets
-clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
-clippy_concrete_csprng
+clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
+clippy_core clippy_concrete_csprng

 .PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
 clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings

+.PHONY: tfhe_lints # Run custom tfhe-rs lints
+tfhe_lints: install_tfhe_lints
+	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings
+
 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
@@ -402,7 +417,8 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
 		wasm-pack build --release --target=web \
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
-		-Z build-std=panic_abort,std
+		-Z build-std=panic_abort,std && \
+	find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;

 .PHONY: build_node_js_api # Build the js API targeting nodejs
 build_node_js_api: install_rs_build_toolchain install_wasm_pack
@@ -445,8 +461,8 @@ test_cuda_backend:
 	mkdir -p "$(TFHECUDA_BUILD)" && \
 		cd "$(TFHECUDA_BUILD)" && \
 		cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
-		make -j "$(CPU_COUNT)" && \
-		make test
+		"$(MAKE)" -j "$(CPU_COUNT)" && \
+		"$(MAKE)" test

 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
@@ -469,6 +485,7 @@ test_integer_gpu: install_rs_build_toolchain
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
 		--tfhe-package "$(TFHE_SPEC)"
@@ -477,6 +494,7 @@ test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
 		--unsigned-only --tfhe-package "$(TFHE_SPEC)"
@@ -485,6 +503,7 @@ test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"
@@ -493,6 +512,7 @@ test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
 		--tfhe-package "$(TFHE_SPEC)"
@@ -501,6 +521,7 @@ test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
 		--unsigned-only --tfhe-package "$(TFHE_SPEC)"
@@ -509,6 +530,7 @@ test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo
 test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"
@@ -575,6 +597,7 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
 test_integer_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--tfhe-package "$(TFHE_SPEC)"
@@ -583,6 +606,7 @@ test_integer_ci: install_rs_check_toolchain install_cargo_nextest
 test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--unsigned-only --tfhe-package "$(TFHE_SPEC)"
@@ -591,6 +615,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
 test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"
@@ -599,6 +624,7 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
 test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--tfhe-package "$(TFHE_SPEC)"
@@ -607,6 +633,7 @@ test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
 test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--unsigned-only --tfhe-package "$(TFHE_SPEC)"
@@ -615,6 +642,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
 test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
+	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"
@@ -708,14 +736,21 @@ test_versionable: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-versionable

+# The backward compat data repo holds historical binary data as well as the Rust code to generate and load it.
+# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
 .PHONY: test_backward_compatibility_ci
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
 		--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
 test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci

+.PHONY: backward_compat_branch # Prints the required backward compatibility branch
+backward_compat_branch:
+	@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
+
 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
 	@# Even though we are not in docs.rs, this allows to "just" build the doc
@@ -778,7 +813,7 @@ check_compile_tests_benches_gpu: install_rs_build_toolchain
 	mkdir -p "$(TFHECUDA_BUILD)" && \
 		cd "$(TFHECUDA_BUILD)" && \
 		cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
-		make -j "$(CPU_COUNT)"
+		"$(MAKE)" -j "$(CPU_COUNT)"

 .PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
 build_nodejs_test_docker:
@@ -798,7 +833,7 @@ test_nodejs_wasm_api_in_docker: build_nodejs_test_docker

 .PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
 test_nodejs_wasm_api: build_node_js_api
-	cd tfhe && node --test js_on_wasm_tests
+	cd tfhe/js_on_wasm_tests && npm run test

 .PHONY: test_web_js_api_parallel # Run tests for the web wasm api
 test_web_js_api_parallel: build_web_js_api_parallel
@@ -878,7 +913,7 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- unsigned
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
@@ -895,16 +930,12 @@ bench_shortint: install_rs_check_toolchain
 	--bench shortint-bench \
 	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

-.PHONY: bench_oprf # Run benchmarks for shortint
-bench_oprf: install_rs_check_toolchain
+.PHONY: bench_shortint_oprf # Run OPRF benchmarks for shortint
+bench_shortint_oprf: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-shortint-bench \
 	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-	RUSTFLAGS="$(RUSTFLAGS)" \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench oprf-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
@@ -934,7 +965,7 @@ bench_pbs128: install_rs_check_toolchain

 .PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
 bench_pbs_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
 	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

@@ -957,7 +988,7 @@ bench_web_js_api_parallel: build_web_js_api_parallel
 .PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
 bench_web_js_api_parallel_ci: build_web_js_api_parallel
 	source ~/.nvm/nvm.sh && \
-	nvm use node && \
+	nvm use $(NODE_VERSION) && \
 	$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci

 #
@@ -1016,7 +1047,7 @@ write_params_to_file: install_rs_check_toolchain

 .PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
 clone_backward_compat_data:
-	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
+	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tfhe/$(BACKWARD_COMPAT_DATA_DIR)

 tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data

@@ -1046,7 +1077,7 @@ sha256_bool: install_rs_check_toolchain

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
-clippy_all check_compile_tests
+clippy_all tfhe_lints check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
--- a/apps/trivium/benches/kreyvium_byte.rs
+++ b/apps/trivium/benches/kreyvium_byte.rs
@@ -4,9 +4,8 @@ use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
 use tfhe_trivium::{KreyviumStreamByte, TransCiphering};

 pub fn kreyvium_byte_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
-        .build();
+    let config = ConfigBuilder::default().build();
+
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -33,9 +32,8 @@ pub fn kreyvium_byte_gen(c: &mut Criterion) {
 }

 pub fn kreyvium_byte_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
-        .build();
+    let config = ConfigBuilder::default().build();
+
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -63,9 +61,8 @@ pub fn kreyvium_byte_trans(c: &mut Criterion) {
 }

 pub fn kreyvium_byte_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
-        .build();
+    let config = ConfigBuilder::default().build();
+
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/kreyvium/kreyvium_byte.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_byte.rs
@@ -119,7 +119,7 @@ impl KreyviumStreamByte<FheUint8> {
        }

        // Key and iv are stored in reverse in their shift registers
-        let mut key = key_bytes.map(|b| b.map(|x| (x as u8).reverse_bits() as u64));
+        let mut key = key_bytes.map(|b| b.reverse_bits());
        let mut iv = iv_bytes.map(|x| FheUint8::encrypt_trivial(x.reverse_bits()));
        key.reverse();
        iv.reverse();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -299,9 +299,8 @@ fn kreyvium_test_clear_byte() {

 #[test]
 fn kreyvium_test_byte_long() {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
-        .build();
+    let config = ConfigBuilder::default().build();
+
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -338,9 +337,8 @@ fn kreyvium_test_byte_long() {

 #[test]
 fn kreyvium_test_fhe_byte_transciphering_long() {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
-        .build();
+    let config = ConfigBuilder::default().build();
+
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.3.0"
+version = "0.4.0-alpha.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -8,6 +8,18 @@ fn main() {
        }
    }

+    // Workaround for a build issue with the current nightly toolchain (2024-06-27; the issue
+    // first appeared with toolchain 2024-05-05).
+    // When cbindgen is running, a wrong argument ends up forwarded to the CUDA backend "make"
+    // command during macro expansion for the TFHE-rs C API, which crashes make < 4.4 and thus
+    // fails the build.
+    // As a side benefit, this greatly speeds up the C API build: the CUDA backend has no macro
+    // expansions, so this skips the second compilation of TFHE-rs that cbindgen performs for
+    // macro inspection.
+    if std::env::var("_CBINDGEN_IS_RUNNING").is_ok() {
+        return;
+    }
+
    println!("Build tfhe-cuda-backend");
    println!("cargo::rerun-if-changed=cuda/include");
    println!("cargo::rerun-if-changed=cuda/src");
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,6 +1,7 @@
 #ifndef CUDA_CIPHERTEXT_H
 #define CUDA_CIPHERTEXT_H

+#include "device.h"
 #include <cstdint>

 extern "C" {
@@ -14,5 +15,11 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
+
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_glwes,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size);
 };
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -64,14 +64,8 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);

 int cuda_get_max_shared_memory(uint32_t gpu_index);
-
-void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
-                              cudaStreamCallback_t callback, void *user_data);
 }

-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer);
-
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                          Torus *d_array, Torus value, Torus n);
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -1,6 +1,8 @@
 #ifndef HELPER_MULTI_GPU_H
 #define HELPER_MULTI_GPU_H
 #include <mutex>
+#include <variant>
+#include <vector>

 extern std::mutex m;
 extern bool p2p_enabled;
@@ -9,6 +11,20 @@ extern "C" {
 int cuda_setup_multi_gpu();
 }

+// Define a variant type that can be either a vector or a single pointer
+template <typename Torus>
+using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
+
+// Macro to define the visitor logic using std::holds_alternative for vectors
+#define GET_VARIANT_ELEMENT(variant, index)                                    \
+  [&] {                                                                        \
+    if (std::holds_alternative<std::vector<Torus *>>(variant)) {               \
+      return std::get<std::vector<Torus *>>(variant)[index];                   \
+    } else {                                                                   \
+      return std::get<Torus *>(variant);                                       \
+    }                                                                          \
+  }()
+
 int get_active_gpu_count(int num_inputs, int gpu_count);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -35,8 +35,11 @@ enum COMPARISON_TYPE {
  MAX = 6,
  MIN = 7,
 };
+
 enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

+enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
+
 extern "C" {
 void scratch_cuda_apply_univariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
@@ -81,9 +84,8 @@ void scratch_cuda_full_propagation_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);

 void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
                                      uint32_t gpu_count, void *input_blocks,
@@ -99,7 +101,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);

 void cuda_integer_mult_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -281,7 +283,7 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
+void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -290,15 +292,14 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);

-void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
+void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);

-void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
-                                                    uint32_t *gpu_indexes,
-                                                    uint32_t gpu_count,
-                                                    int8_t **mem_ptr_void);
+void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);

 void scratch_cuda_integer_radix_overflowing_sub_kb_64(
    void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
@@ -355,6 +356,48 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void);

+void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
+    void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_blocks_in_radix);
+
+void cleanup_signed_overflowing_add_or_sub(void **streams,
+                                           uint32_t *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t shift);
+
+void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cuda_integer_reverse_blocks_64_inplace(void **streams,
+                                            uint32_t *gpu_indexes,
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size);
+
 } // extern C

 template <typename Torus>
@@ -466,11 +509,21 @@ template <typename Torus> struct int_radix_lut {
  // for the moment
  Torus *lwe_indexes_in;
  Torus *lwe_indexes_out;
+  Torus *h_lwe_indexes_in;
+  Torus *h_lwe_indexes_out;
+  // Enable optimizations if lwe_indexes_(in/out) are trivial
+  bool using_trivial_lwe_indexes = true;
  // lwe_trivial_indexes is the intermediary index we need in case
  // lwe_indexes_in != lwe_indexes_out
  Torus *lwe_trivial_indexes;
  Torus *tmp_lwe_before_ks;
-  Torus *tmp_lwe_after_ks;
+
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec;
+  std::vector<Torus *> lwe_after_pbs_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec;

  int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
                uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
@@ -490,13 +543,12 @@ template <typename Torus> struct int_radix_lut {
      cudaSetDevice(i);
      int8_t *gpu_pbs_buffer;
      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count);
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);

      execute_scratch_pbs<Torus>(
          streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
          params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
-          params.grouping_factor, num_blocks_on_gpu,
-          cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type,
+          params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
          allocate_gpu_memory);
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      buffer.push_back(gpu_pbs_buffer);
@@ -530,22 +582,43 @@ template <typename Torus> struct int_radix_lut {
          num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
      lwe_trivial_indexes = (Torus *)cuda_malloc_async(
          num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
-      auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
+
+      h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
+      h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

      for (int i = 0; i < num_radix_blocks; i++)
-        h_lwe_indexes[i] = i;
+        h_lwe_indexes_in[i] = i;

-      cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
+      cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
                               num_radix_blocks * sizeof(Torus), streams[0],
                               gpu_indexes[0]);
-      cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
+      cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
                               num_radix_blocks * sizeof(Torus), streams[0],
                               gpu_indexes[0]);
-      cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
+      cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
                               num_radix_blocks * sizeof(Torus), streams[0],
                               gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes);
+      memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
+             num_radix_blocks * sizeof(Torus));
+
+      /// With multiple GPUs we allocate arrays to be pushed to the vectors and
+      /// copy data on each GPU; then, when we gather data to GPU 0, we can copy
+      /// back to the original indexing
+      multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
+                                lwe_array_in_vec, num_radix_blocks,
+                                params.big_lwe_dimension + 1);
+      multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
+                                lwe_after_ks_vec, num_radix_blocks,
+                                params.small_lwe_dimension + 1);
+      multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
+                                lwe_after_pbs_vec, num_radix_blocks,
+                                params.big_lwe_dimension + 1);
+      multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
+                                  lwe_trivial_indexes_vec, num_radix_blocks);
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
+                                 lwe_trivial_indexes_vec, lwe_trivial_indexes,
+                                 num_radix_blocks);

      // Keyswitch
      Torus big_size =
@@ -554,8 +627,6 @@ template <typename Torus> struct int_radix_lut {
          (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
      tmp_lwe_before_ks =
          (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
-      tmp_lwe_after_ks =
-          (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
    }
  }

@@ -577,7 +648,14 @@ template <typename Torus> struct int_radix_lut {
    buffer = base_lut_object->buffer;
    // Keyswitch
    tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
-    tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
+
+    /// With multiple GPUs we allocate arrays to be pushed to the vectors and
+    /// copy data on each GPU; then, when we gather data to GPU 0, we can copy
+    /// back to the original indexing
+    lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
+    lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
+    lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
+    lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;

    mem_reuse = true;

@@ -609,22 +687,24 @@ template <typename Torus> struct int_radix_lut {
        num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
    lwe_trivial_indexes = (Torus *)cuda_malloc_async(
        num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
-    auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
+
+    h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
+    h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));

    for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
+      h_lwe_indexes_in[i] = i;

-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
                             num_radix_blocks * sizeof(Torus), streams[0],
                             gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
                             num_radix_blocks * sizeof(Torus), streams[0],
                             gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
+    cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
                             num_radix_blocks * sizeof(Torus), streams[0],
                             gpu_indexes[0]);
-    cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                             host_free_on_stream_callback, h_lwe_indexes);
+    memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
+           num_radix_blocks * sizeof(Torus));
  }

  // Return a pointer to idx-ith lut at gpu_index's global memory
@@ -642,6 +722,22 @@ template <typename Torus> struct int_radix_lut {
    return &lut_indexes[ind];
  }

+  // If this function is called we assume the lwe_indexes_(in/out) are no
+  // longer trivial and thus we disable optimizations
+  void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
+                       Torus *h_indexes_in, Torus *h_indexes_out) {
+
+    memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
+    memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
+
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
+                             num_blocks * sizeof(Torus), stream, gpu_index);
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
+                             num_blocks * sizeof(Torus), stream, gpu_index);
+
+    using_trivial_lwe_indexes = false;
+  }
+
  // Broadcast luts from gpu src_gpu_idx to all active gpus
  void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
                     uint32_t src_gpu_idx) {
@@ -651,7 +747,6 @@ template <typename Torus> struct int_radix_lut {
    auto src_lut_indexes = lut_indexes_vec[src_gpu_idx];

    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-#pragma omp parallel for num_threads(active_gpu_count)
    for (uint i = 0; i < active_gpu_count; i++) {
      if (i != src_gpu_idx) {
        auto dst_lut = lut_vec[i];
@@ -669,7 +764,6 @@ template <typename Torus> struct int_radix_lut {

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
-#pragma omp parallel for num_threads(active_gpu_count)
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
      cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]);
@@ -680,9 +774,13 @@ template <typename Torus> struct int_radix_lut {
    cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]);
    cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]);
    cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]);
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(h_lwe_indexes_in);
+    free(h_lwe_indexes_out);
+
    if (!mem_reuse) {
      cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]);
-      cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]);
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
      for (int i = 0; i < buffer.size(); i++) {
        switch (params.pbs_type) {
@@ -700,6 +798,17 @@ template <typename Torus> struct int_radix_lut {
        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      }
      buffer.clear();
+
+      multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
+      multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
+      multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
+      multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
+      for (uint i = 0; i < active_gpu_count; i++)
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      lwe_array_in_vec.clear();
+      lwe_after_ks_vec.clear();
+      lwe_after_pbs_vec.clear();
+      lwe_trivial_indexes_vec.clear();
    }
  }
 };
@@ -749,8 +858,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
          num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
          gpu_indexes[0]);
      lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lut_indexes);

      /**
       * the input indexes should take the first bits_per_block PBS to target
@@ -763,12 +870,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
        for (int i = 0; i < bits_per_block; i++)
          h_lwe_indexes_in[i + j * bits_per_block] = j;
      }
-      cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
-                               num_radix_blocks * bits_per_block *
-                                   sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes_in);

      /**
       * the output should aim different lwe ciphertexts, so lwe_indexes_out =
@@ -780,12 +881,13 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
      for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
        h_lwe_indexes_out[i] = i;

-      cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
-                               num_radix_blocks * bits_per_block *
-                                   sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes_out);
+      lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
+                           h_lwe_indexes_out);
+
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      free(h_lut_indexes);
+      free(h_lwe_indexes_in);
+      free(h_lwe_indexes_out);
    }
  }

@@ -955,10 +1057,10 @@ template <typename Torus> struct int_fullprop_buffer {

  int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                      uint32_t gpu_count, int_radix_params params,
-                      uint32_t num_radix_blocks, bool allocate_gpu_memory) {
+                      bool allocate_gpu_memory) {
    this->params = params;
-    lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2,
-                                   num_radix_blocks, allocate_gpu_memory);
+    lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2, 2,
+                                   allocate_gpu_memory);

    if (allocate_gpu_memory) {

@@ -984,15 +1086,13 @@ template <typename Torus> struct int_fullprop_buffer {
          params.polynomial_size, params.message_modulus, params.carry_modulus,
          lut_f_carry);

-      Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
+      Torus lwe_indexes_size = 2 * sizeof(Torus);
      Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
-      for (int i = 0; i < num_radix_blocks; i++)
+      for (int i = 0; i < 2; i++)
        h_lwe_indexes[i] = i;
      Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
      cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                               streams[0], gpu_indexes[0]);
-      cuda_stream_add_callback(streams[0], gpu_indexes[0],
-                               host_free_on_stream_callback, h_lwe_indexes);

      lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

@@ -1007,6 +1107,8 @@ template <typename Torus> struct int_fullprop_buffer {
          small_vector_size, streams[0], gpu_indexes[0]);
      tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
          big_vector_size, streams[0], gpu_indexes[0]);
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      free(h_lwe_indexes);
    }
  }

@@ -1014,6 +1116,7 @@ template <typename Torus> struct int_fullprop_buffer {
               uint32_t gpu_count) {

    lut->release(streams, gpu_indexes, 1);
+    delete lut;

    cuda_drop_async(tmp_small_lwe_vector, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_big_lwe_vector, streams[0], gpu_indexes[0]);
@@ -1135,7 +1238,7 @@ template <typename Torus> struct int_sc_prop_memory {
  }
 };

-template <typename Torus> struct int_single_borrow_prop_memory {
+template <typename Torus> struct int_overflowing_sub_memory {
  Torus *generates_or_propagates;
  Torus *step_output;

@@ -1147,10 +1250,10 @@ template <typename Torus> struct int_single_borrow_prop_memory {

  int_radix_params params;

-  int_single_borrow_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
-                                uint32_t gpu_count, int_radix_params params,
-                                uint32_t num_radix_blocks,
-                                bool allocate_gpu_memory) {
+  int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count, int_radix_params params,
+                             uint32_t num_radix_blocks,
+                             bool allocate_gpu_memory) {
    this->params = params;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
@@ -1335,60 +1438,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
  }
 };

-template <typename Torus> struct int_overflowing_sub_memory {
-  int_radix_params params;
-  int_radix_lut<Torus> *luts_message_carry;
-  int_single_borrow_prop_memory<Torus> *borrow_prop_mem;
-  int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
-                             uint32_t gpu_count, int_radix_params params,
-                             uint32_t num_blocks, bool allocate_gpu_memory) {
-    this->params = params;
-    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;
-
-    borrow_prop_mem = new int_single_borrow_prop_memory<Torus>(
-        streams, gpu_indexes, gpu_count, params, num_blocks,
-        allocate_gpu_memory);
-
-    int max_pbs_count = num_blocks * 2;
-
-    // create lut object for message and carry
-    luts_message_carry =
-        new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
-                                 max_pbs_count, allocate_gpu_memory);
-
-    auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
-    auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
-
-    // define functions for each accumulator
-    auto lut_f_message = [message_modulus](Torus x) -> Torus {
-      return x % message_modulus;
-    };
-    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
-      return x / message_modulus;
-    };
-
-    // generate accumulators
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], message_acc, params.glwe_dimension,
-        params.polynomial_size, message_modulus, carry_modulus, lut_f_message);
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], carry_acc, params.glwe_dimension,
-        params.polynomial_size, message_modulus, carry_modulus, lut_f_carry);
-
-    luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-  }
-
-  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
-               uint32_t gpu_count) {
-    luts_message_carry->release(streams, gpu_indexes, gpu_count);
-    borrow_prop_mem->release(streams, gpu_indexes, gpu_count);
-
-    delete luts_message_carry;
-    delete borrow_prop_mem;
-  }
-};
-
 template <typename Torus> struct int_mul_memory {
  Torus *vector_result_sb;
  Torus *block_mul_res;
@@ -1700,6 +1749,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

  cudaStream_t *local_streams_1;
  cudaStream_t *local_streams_2;
+  uint32_t active_gpu_count;

  int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
                                     uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -1707,12 +1757,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
                                     int_radix_params params,
                                     uint32_t num_radix_blocks,
                                     bool allocate_gpu_memory) {
+    active_gpu_count = get_active_gpu_count(1, gpu_count);
    // In the arithmetic shift, a PBS has to be applied to the last rotated
    // block twice: once to shift it, once to compute the padding block to be
    // copied onto all blocks to the left of the last rotated block
-    local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < gpu_count; j++) {
+    local_streams_1 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    local_streams_2 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < active_gpu_count; j++) {
      local_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
      local_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
    }
@@ -1723,12 +1776,12 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      uint32_t big_lwe_size = params.big_lwe_dimension + 1;
      uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

-      tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 2) *
+      tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 3) *
                                                   big_lwe_size_bytes,
                                               streams[0], gpu_indexes[0]);

      cuda_memset_async(tmp_rotated, 0,
-                        (num_radix_blocks + 2) * big_lwe_size_bytes, streams[0],
+                        (num_radix_blocks + 3) * big_lwe_size_bytes, streams[0],
                        gpu_indexes[0]);

      uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
@@ -1845,7 +1898,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < active_gpu_count; j++) {
      cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
      cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
    }
@@ -1874,20 +1927,24 @@ template <typename Torus> struct int_zero_out_if_buffer {

  cudaStream_t *true_streams;
  cudaStream_t *false_streams;
+  uint32_t active_gpu_count;

  int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                         uint32_t gpu_count, int_radix_params params,
                         uint32_t num_radix_blocks, bool allocate_gpu_memory) {
    this->params = params;
+    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

    Torus big_size =
        (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
    if (allocate_gpu_memory) {
      tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
      // We may use a different stream to allow concurrent operation
-      true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-      false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-      for (uint j = 0; j < gpu_count; j++) {
+      true_streams =
+          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+      false_streams =
+          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+      for (uint j = 0; j < active_gpu_count; j++) {
        true_streams[j] = cuda_create_stream(gpu_indexes[j]);
        false_streams[j] = cuda_create_stream(gpu_indexes[j]);
      }
@@ -1896,7 +1953,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < active_gpu_count; j++) {
      cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
      cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
    }
@@ -2046,6 +2103,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
               uint32_t gpu_count) {
    for (auto &lut : is_equal_to_lut_map) {
      lut.second->release(streams, gpu_indexes, gpu_count);
+      delete (lut.second);
    }
    is_equal_to_lut_map.clear();

@@ -2192,13 +2250,11 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
                                   num_radix_blocks, allocate_gpu_memory);

-      tree_last_leaf_lut =
-          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
-                                   num_radix_blocks, allocate_gpu_memory);
+      tree_last_leaf_lut = new int_radix_lut<Torus>(
+          streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);

-      tree_last_leaf_scalar_lut =
-          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
-                                   num_radix_blocks, allocate_gpu_memory);
+      tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
+          streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
      generate_device_accumulator_bivariate<Torus>(
          streams[0], gpu_indexes[0],
          tree_inner_leaf_lut->get_lut(gpu_indexes[0], 0),
@@ -2330,6 +2386,7 @@ template <typename Torus> struct int_comparison_buffer {
  int_radix_lut<Torus> *signed_msb_lut;
  cudaStream_t *lsb_streams;
  cudaStream_t *msb_streams;
+  uint32_t active_gpu_count;

  int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, COMPARISON_TYPE op,
@@ -2339,14 +2396,18 @@ template <typename Torus> struct int_comparison_buffer {
    this->op = op;
    this->is_signed = is_signed;

+    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+
    identity_lut_f = [](Torus x) -> Torus { return x; };

    auto big_lwe_size = params.big_lwe_dimension + 1;

    if (allocate_gpu_memory) {
-      lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-      msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-      for (uint j = 0; j < gpu_count; j++) {
+      lsb_streams =
+          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+      msb_streams =
+          (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+      for (uint j = 0; j < active_gpu_count; j++) {
        lsb_streams[j] = cuda_create_stream(gpu_indexes[j]);
        msb_streams[j] = cuda_create_stream(gpu_indexes[j]);
      }
@@ -2510,7 +2571,7 @@ template <typename Torus> struct int_comparison_buffer {
      signed_msb_lut->release(streams, gpu_indexes, gpu_count);
      delete (signed_msb_lut);
    }
-    for (uint j = 0; j < gpu_count; j++) {
+    for (uint j = 0; j < active_gpu_count; j++) {
      cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
      cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
    }
@@ -2521,6 +2582,7 @@ template <typename Torus> struct int_comparison_buffer {

 template <typename Torus> struct int_div_rem_memory {
  int_radix_params params;
+  uint32_t active_gpu_count;

  // memory objects for other operations
  int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
@@ -2627,7 +2689,7 @@ template <typename Torus> struct int_div_rem_memory {
          [shifted_mask](Torus x) -> Torus { return x & shifted_mask; };

      masking_luts_1[i] = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true);
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);

@@ -2741,7 +2803,7 @@ template <typename Torus> struct int_div_rem_memory {
      };

      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true);

      generate_device_accumulator_bivariate<Torus>(
          streams[0], gpu_indexes[0],
@@ -2756,6 +2818,8 @@ template <typename Torus> struct int_div_rem_memory {
  int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
                     uint32_t gpu_count, int_radix_params params,
                     uint32_t num_blocks, bool allocate_gpu_memory) {
+    active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
+
    this->params = params;
    shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
@@ -2775,11 +2839,15 @@ template <typename Torus> struct int_div_rem_memory {
    init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
    init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);

-    sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < gpu_count; j++) {
+    sub_streams_1 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    sub_streams_2 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    sub_streams_3 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    sub_streams_4 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < active_gpu_count; j++) {
      sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
      sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
      sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
@@ -2850,7 +2918,7 @@ template <typename Torus> struct int_div_rem_memory {
    delete[] merge_overflow_flags_luts;

    // release sub streams
-    for (uint i = 0; i < gpu_count; i++) {
+    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
      cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
      cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
@@ -2884,6 +2952,247 @@ template <typename Torus> struct int_div_rem_memory {
  }
 };

+template <typename Torus> struct int_last_block_inner_propagate_memory {
+  // Owns the bivariate LUT that classifies a block pair's inner carry.
+  int_radix_lut<Torus> *last_block_inner_propagation_lut;
+  int_radix_params params;
+  // Generates the LUT at gpu_indexes[0], then calls broadcast_lut on it.
+  int_last_block_inner_propagate_memory(
+      cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+      int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks,
+      bool allocate_gpu_memory) {
+    // NOTE(review): num_radix_blocks is not read anywhere in this constructor.
+    this->params = params;
+    auto message_modulus = params.message_modulus;
+    uint32_t bits_of_message =
+        static_cast<uint32_t>(std::log2(params.message_modulus));
+    Torus message_bit_mask = (1 << bits_of_message) - 1;
+
+    // LUT function: (lhs_block, rhs_block) -> OUTPUT_CARRY in the carry bits
+    auto f_last_block_inner_propagation_lut =
+        [op, message_modulus, message_bit_mask,
+         bits_of_message](Torus lhs_block, Torus rhs_block) -> Torus {
+      uint64_t rhs_block_modified;
+      if (op == SIGNED_OPERATION::SUBTRACTION) {
+        // Subtraction is done by adding the negation
+        // Negation(x) = bit_flip(x) + 1
+        // Only add the flipped value, the +1 will be resolved by carry
+        // propagation computation
+        uint64_t flipped_rhs = ~rhs_block;
+
+        // Shift left then mask: drops the top message bit, unused in this step
+        rhs_block_modified = (flipped_rhs << 1) & message_bit_mask;
+      } else {
+        rhs_block_modified = (rhs_block << 1) & message_bit_mask;
+      }
+
+      uint64_t lhs_block_modified = (lhs_block << 1) & message_bit_mask;
+
+      // whole_result contains the result of addition with
+      // the carry being in the first bit of carry space
+      // the message space contains the message, but with one 0
+      // on the right (LSB)
+      uint64_t whole_result = lhs_block_modified + rhs_block_modified;
+      uint64_t carry = whole_result >> bits_of_message;
+      uint64_t result = (whole_result & message_bit_mask) >> 1;
+      OUTPUT_CARRY propagation_result;
+      if (carry == 1) {
+        // Addition of bits before the last one generates a carry
+        propagation_result = OUTPUT_CARRY::GENERATED;
+      } else if (result == ((message_modulus - 1) >> 1)) {
+        // Addition of bits before the last one puts the bits
+        // in a state that makes it so that an input carry into the last block
+        // gets propagated to the last bit.
+        propagation_result = OUTPUT_CARRY::PROPAGATED;
+      } else {
+        propagation_result = OUTPUT_CARRY::NONE;
+      }
+
+      // Shift the propagation result in the carry part
+      // to have less noise growth later
+      return (static_cast<uint64_t>(propagation_result) << bits_of_message);
+    };
+
+    last_block_inner_propagation_lut = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
+
+    generate_device_accumulator_bivariate<Torus>(
+        streams[0], gpu_indexes[0],
+        last_block_inner_propagation_lut->get_lut(gpu_indexes[0], 0),
+        params.glwe_dimension, params.polynomial_size, message_modulus,
+        params.carry_modulus, f_last_block_inner_propagation_lut);
+    last_block_inner_propagation_lut->broadcast_lut(streams, gpu_indexes,
+                                                    gpu_indexes[0]);
+  }
+  // Calls release() on the LUT, then deletes the LUT object itself.
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+    last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count);
+    delete last_block_inner_propagation_lut;
+  }
+};
+
+template <typename Torus> struct int_resolve_signed_overflow_memory {
+  // Owns the univariate LUT that turns a packed carry word into a 0/1 flag.
+  int_radix_lut<Torus> *resolve_overflow_lut;
+  int_radix_params params;
+
+  Torus *x; // device buffer of (big_lwe_dimension + 1) Torus elements
+  // NOTE(review): x is allocated even when allocate_gpu_memory is false.
+  int_resolve_signed_overflow_memory(cudaStream_t *streams,
+                                     uint32_t *gpu_indexes, uint32_t gpu_count,
+                                     int_radix_params params,
+                                     bool allocate_gpu_memory) {
+
+    this->params = params;
+
+    auto message_modulus = params.message_modulus;
+
+    uint32_t bits_of_message =
+        static_cast<uint32_t>(std::log2(message_modulus));
+
+    x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
+                                       sizeof(Torus),
+                                   streams[0], gpu_indexes[0]);
+
+    // LUT input: bit 0 = input carry, bit 1 = output carry, rest OUTPUT_CARRY
+    auto f_resolve_overflow_lut = [bits_of_message](Torus x) -> Torus {
+      Torus carry_propagation = x >> bits_of_message;
+      Torus output_carry_of_block = (x >> 1) & 1;
+      Torus input_carry_of_block = x & 1;
+
+      // Resolve the carry that the last bit actually receives as input
+      Torus input_carry_to_last_bit;
+      if (carry_propagation == OUTPUT_CARRY::PROPAGATED) {
+        input_carry_to_last_bit = input_carry_of_block;
+      } else if (carry_propagation == OUTPUT_CARRY::GENERATED) {
+        input_carry_to_last_bit = 1;
+      } else {
+        input_carry_to_last_bit = 0;
+      };
+      // Result is 1 when the carry into the last bit differs from the carry out
+      return input_carry_to_last_bit != output_carry_of_block;
+    };
+
+    resolve_overflow_lut = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
+
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0],
+        resolve_overflow_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
+        params.polynomial_size, message_modulus, params.carry_modulus,
+        f_resolve_overflow_lut);
+    resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  }
+  // Releases and deletes the LUT, then drops x on streams[0] / gpu_indexes[0].
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+    resolve_overflow_lut->release(streams, gpu_indexes, gpu_count);
+    delete resolve_overflow_lut;
+    cuda_drop_async(x, streams[0], gpu_indexes[0]);
+  }
+};
+
+template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
+  int_radix_params params;
+  uint32_t active_gpu_count;
+  // Scratch state for signed overflowing add/sub: sub-mems, streams, buffers.
+  // memory objects for other operations
+  int_sc_prop_memory<Torus> *scp_mem;
+  int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem;
+  int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem;
+  // NOTE(review): las_block_prop_mem is presumably a typo of last_block_...
+  // sub streams
+  cudaStream_t *sub_streams_1;
+  cudaStream_t *sub_streams_2;
+
+  // temporary device buffers
+  Torus *result;                       // num_blocks
+  Torus *input_carries;                // num_blocks
+  Torus *neg_rhs;                      // num_blocks
+  Torus *output_carry;                 // single block
+  Torus *last_block_inner_propagation; // single block
+
+  // Allocates the temporary device buffers listed above, all on streams[0] /
+  // gpu_indexes[0]; the gpu_count argument is not used by this function.
+  void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                  uint32_t gpu_count, uint32_t num_blocks) {
+    uint32_t big_lwe_size = params.big_lwe_dimension + 1;
+
+    result = (Torus *)cuda_malloc_async(
+        big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+    neg_rhs = (Torus *)cuda_malloc_async(
+        big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+    input_carries = (Torus *)cuda_malloc_async(
+        big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
+    output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus),
+                                              streams[0], gpu_indexes[0]);
+    last_block_inner_propagation = (Torus *)cuda_malloc_async(
+        big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
+  }
+
+  // constructor without memory reuse
+  int_signed_overflowing_add_or_sub_memory(
+      cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+      int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
+      bool allocate_gpu_memory) {
+    this->params = params;
+    active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
+
+    allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count,
+                               num_blocks);
+
+    // create two sub-streams for each of the active_gpu_count GPUs
+    sub_streams_1 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    sub_streams_2 =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < active_gpu_count; j++) {
+      sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
+      sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
+    }
+
+    // initialize memory objects for other operations
+    scp_mem =
+        new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                      num_blocks, allocate_gpu_memory);
+    las_block_prop_mem = new int_last_block_inner_propagate_memory<Torus>(
+        streams, gpu_indexes, gpu_count, params, op, num_blocks,
+        allocate_gpu_memory);
+
+    resolve_overflow_mem = new int_resolve_signed_overflow_memory<Torus>(
+        streams, gpu_indexes, gpu_count, params, allocate_gpu_memory);
+  }
+  // Releases the sub-objects, device buffers and sub-streams created above.
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+    // memory objects for other operations
+    scp_mem->release(streams, gpu_indexes, gpu_count);
+    las_block_prop_mem->release(streams, gpu_indexes, gpu_count);
+    resolve_overflow_mem->release(streams, gpu_indexes, gpu_count);
+
+    delete scp_mem;
+    delete las_block_prop_mem;
+    delete resolve_overflow_mem;
+
+    // temporary device buffers
+    cuda_drop_async(result, streams[0], gpu_indexes[0]);
+    cuda_drop_async(neg_rhs, streams[0], gpu_indexes[0]);
+    cuda_drop_async(input_carries, streams[0], gpu_indexes[0]);
+    cuda_drop_async(output_carry, streams[0], gpu_indexes[0]);
+    cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]);
+
+    // sub streams
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
+      cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
+    }
+    free(sub_streams_1);
+    free(sub_streams_2);
+  }
+};
 template <typename Torus> struct int_bitop_buffer {

  int_radix_params params;
@@ -2951,8 +3260,6 @@ template <typename Torus> struct int_bitop_buffer {
        lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
      }
    }
-
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -2981,8 +3288,8 @@ template <typename Torus> struct int_scalar_mul_buffer {
      size_t num_ciphertext_bits = msg_bits * num_radix_blocks;

      //// Contains all shifted values of lhs for shift in range (0..msg_bits)
-      //// The idea is that with these we can create all other shift that are in
-      //// range (0..total_bits) for free (block rotation)
+      //// The idea is that with these we can create all other shifts that are
+      //// in range (0..total_bits) for free (block rotation)
      preshifted_buffer = (Torus *)cuda_malloc_async(
          num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]);

@@ -3010,9 +3317,8 @@ template <typename Torus> struct int_scalar_mul_buffer {

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
-    logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
    sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
-    cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+    delete sum_ciphertexts_vec_mem;
    cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]);
  }
 };
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -9,15 +9,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t gpu_offset = 0);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t gpu_offset = 0);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -26,14 +26,12 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
 void scratch_cuda_programmable_bootstrap_amortized_32(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 void scratch_cuda_programmable_bootstrap_amortized_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -41,8 +39,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t num_samples);

 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -50,8 +47,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+    uint32_t num_samples);

 void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
@@ -60,14 +56,12 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
 void scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 void scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -75,8 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+    uint32_t num_samples);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -84,44 +77,41 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+    uint32_t num_samples);

 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);

 uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+    uint32_t input_lwe_ciphertext_count);

 uint64_t get_buffer_size_programmable_bootstrap_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+    uint32_t input_lwe_ciphertext_count);
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_programmable_bootstrap_step_one(
+uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(double2) * polynomial_size / 2; // accumulator fft
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_programmable_bootstrap_step_two(
+uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator
         sizeof(double2) * polynomial_size / 2; // accumulator fft
 }

 template <typename Torus>
-__host__ __device__ uint64_t
+uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }

 template <typename Torus>
-__host__ __device__ uint64_t
+uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(Torus) * polynomial_size +      // accumulator
@@ -129,21 +119,19 @@ get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_programmable_bootstrap_tbc(
+uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
    uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
+uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
    uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // tbc
 }

 template <typename Torus>
-__host__ __device__ uint64_t
+uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(Torus) * polynomial_size +      // accumulator
@@ -151,15 +139,14 @@ get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
 }

 template <typename Torus>
-__host__ __device__ uint64_t
+uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }

 template <typename Torus>
-__host__ bool
-supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory);
+bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
+    uint32_t polynomial_size);

 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

@@ -178,7 +165,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

    this->pbs_variant = pbs_variant;

-    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    auto max_shared_memory = cuda_get_max_shared_memory(0);

    if (allocate_gpu_memory) {
      switch (pbs_variant) {
@@ -255,7 +242,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

        bool supports_dsm =
            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-                Torus>(polynomial_size, max_shared_memory);
+                Torus>(polynomial_size);

        uint64_t full_sm =
            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
@@ -314,10 +301,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 };

 template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
+uint64_t get_buffer_size_programmable_bootstrap_cg(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
-
+    uint32_t input_lwe_ciphertext_count) {
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
  uint64_t partial_sm =
@@ -343,8 +330,7 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                   uint32_t polynomial_size,
                                                   uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   uint32_t max_shared_memory);
+                                                   uint32_t num_samples);

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
@@ -353,8 +339,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+    uint32_t level_count, uint32_t num_samples);

 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -363,8 +348,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+    uint32_t level_count, uint32_t num_samples);

 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -374,43 +358,44 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+    uint32_t level_count, uint32_t num_samples);

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 #endif

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
                                                    uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
-                                                    uint32_t level_count,
-                                                    uint32_t max_shared_memory);
+                                                    uint32_t level_count);

 #ifdef __CUDACC__
 __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t level_count);

+template <typename T>
+__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
+                                           int level, uint32_t polynomial_size,
+                                           int glwe_dimension,
+                                           uint32_t level_count);
+
 template <typename T>
 __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
@@ -422,8 +407,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     int glwe_dimension, uint32_t level_count);

 template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -8,7 +8,7 @@ extern "C" {

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void *src,
@@ -19,8 +19,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -28,9 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t level_count, uint32_t num_samples);

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
@@ -38,23 +35,21 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 }

 template <typename Torus>
-__host__ bool
-supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory);
+bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);

 #if CUDA_ARCH >= 900
-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
 void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -63,25 +58,14 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 #endif

-template <typename Torus, typename STorus>
-void scratch_cuda_cg_multi_bit_programmable_bootstrap(
-    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
-
-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
 void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -90,17 +74,14 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -109,45 +90,34 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
+uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size);

 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
@@ -317,8 +287,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
 };

 template <typename Torus, class params>
-__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                                     uint32_t polynomial_size,
-                                     uint32_t max_shared_memory);
+uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                            uint32_t polynomial_size);

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,4 +1,5 @@
 #include "ciphertext.cuh"
+#include "polynomial/parameters.cuh"

 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
@@ -19,3 +20,58 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
      static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
      (uint64_t *)src, number_of_cts, lwe_dimension);
 }
+
+/// Type-erased 64-bit entry point for GLWE sample extraction.
+///
+/// Dispatches on `polynomial_size` to the matching host_sample_extract
+/// instantiation and panics for any size outside the supported powers of two.
+void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
+                                 void *lwe_array_out, void *glwe_array_in,
+                                 uint32_t *nth_array, uint32_t num_glwes,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size) {
+
+  // Resolve the opaque handles once rather than in every switch arm.
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  auto lwe_out = static_cast<uint64_t *>(lwe_array_out);
+  auto glwe_in = static_cast<uint64_t *>(glwe_array_in);
+
+  switch (polynomial_size) {
+  case 256:
+    host_sample_extract<uint64_t, AmortizedDegree<256>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  case 512:
+    host_sample_extract<uint64_t, AmortizedDegree<512>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  case 1024:
+    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  case 2048:
+    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  case 4096:
+    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  case 8192:
+    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  case 16384:
+    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
+        cuda_stream, gpu_index, lwe_out, glwe_in, nth_array, num_glwes,
+        glwe_dimension);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  }
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -3,6 +3,7 @@

 #include "ciphertext.h"
 #include "device.h"
+#include "polynomial/functions.cuh"
 #include <cstdint>

 template <typename T>
@@ -25,4 +26,39 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }

+// Kernel extracting one LWE ciphertext per CUDA block from an array of GLWEs.
+//
+// Block `blockIdx.x` writes the output LWE at that index and reads its
+// coefficient index from `nth_array[blockIdx.x]`. Outputs are grouped
+// params::degree per input GLWE: output i reads GLWE number
+// i / params::degree.
+template <typename Torus, class params>
+__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
+                               uint32_t *nth_array, uint32_t glwe_dimension) {
+
+  const int input_id = blockIdx.x;
+
+  // Sizes, in Torus elements, of one input GLWE and one output LWE
+  const int glwe_input_size = (glwe_dimension + 1) * params::degree;
+  const int lwe_output_size = glwe_dimension * params::degree + 1;
+
+  auto lwe_out = lwe_array_out + input_id * lwe_output_size;
+
+  // We assume each GLWE holds up to params::degree (the polynomial size)
+  // extractable values, so consecutive groups of params::degree outputs map
+  // to consecutive GLWEs.
+  uint32_t nth_per_glwe = params::degree;
+  auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
+
+  auto nth = nth_array[input_id];
+
+  // Presumably these fill the mask and the body of the output LWE
+  // respectively (helpers are defined elsewhere) -- TODO confirm.
+  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
+}
+
+/// Launches the sample_extract kernel on `stream` of GPU `gpu_index`.
+///
+/// `num_glwes` is used as the grid size, and the kernel treats every block as
+/// one output LWE reading one entry of `nth_array`, so `lwe_array_out` and
+/// `nth_array` must hold at least `num_glwes` entries.
+/// The launch is asynchronous with respect to the host; only launch errors
+/// are checked here.
+template <typename Torus, class params>
+__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
+                                  Torus *lwe_array_out, Torus *glwe_array_in,
+                                  uint32_t *nth_array, uint32_t num_glwes,
+                                  uint32_t glwe_dimension) {
+  // A zero-sized grid is an invalid launch configuration: nothing to extract.
+  if (num_glwes == 0)
+    return;
+
+  // Fail loudly on an invalid device index instead of launching on whatever
+  // device happens to be current.
+  check_cuda_error(cudaSetDevice(gpu_index));
+
+  dim3 grid(num_glwes);
+  dim3 thds(params::degree / params::opt);
+  sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
+      lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -9,16 +9,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t gpu_offset) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
-      gpu_offset);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }

 /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -41,14 +39,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t gpu_offset) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
-      gpu_offset);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -38,26 +38,25 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 // threads in y are used to paralelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
 template <typename Torus>
-__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
-                          Torus *lwe_array_in, Torus *lwe_input_indexes,
-                          Torus *ksk, uint32_t lwe_dimension_in,
-                          uint32_t lwe_dimension_out, uint32_t base_log,
-                          uint32_t level_count, int gpu_offset) {
+__global__ void
+keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+          const Torus *__restrict__ lwe_array_in,
+          const Torus *__restrict__ lwe_input_indexes,
+          const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
+          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;

  extern __shared__ int8_t sharedmem[];
  Torus *lwe_acc_out = (Torus *)sharedmem;
-  auto block_lwe_array_out =
-      get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
-                lwe_dimension_out + 1);
+  auto block_lwe_array_out = get_chunk(
+      lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1);

  if (tid <= lwe_dimension_out) {

    Torus local_lwe_out = 0;
-    auto block_lwe_array_in =
-        get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
-                  lwe_dimension_in + 1);
+    auto block_lwe_array_in = get_chunk(
+        lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1);

    if (tid == lwe_dimension_out && threadIdx.y == 0) {
      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
@@ -103,8 +102,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t gpu_offset = 0) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  cudaSetDevice(gpu_index);

@@ -120,42 +118,40 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(

  keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count);
  check_cuda_error(cudaGetLastError());
 }

 template <typename Torus>
-void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
-                       uint32_t gpu_count, Torus *lwe_array_out,
-                       Torus *lwe_output_indexes, Torus *lwe_array_in,
-                       Torus *lwe_input_indexes, Torus **ksks,
-                       uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-                       uint32_t base_log, uint32_t level_count,
-                       uint32_t num_samples, bool sync_streams = true) {
+void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count,
+                             const LweArrayVariant<Torus> &lwe_array_out,
+                             const LweArrayVariant<Torus> &lwe_output_indexes,
+                             const LweArrayVariant<Torus> &lwe_array_in,
+                             const LweArrayVariant<Torus> &lwe_input_indexes,
+                             Torus **ksks, uint32_t lwe_dimension_in,
+                             uint32_t lwe_dimension_out, uint32_t base_log,
+                             uint32_t level_count, uint32_t num_samples) {

  /// If the number of radix blocks is lower than the number of GPUs, not all
  /// GPUs will be active and there will be 1 input per GPU
-  auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
-  int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count);
-  if (sync_streams)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-#pragma omp parallel for num_threads(active_gpu_count)
-  for (uint i = 0; i < active_gpu_count; i++) {
+  for (uint i = 0; i < gpu_count; i++) {
    int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
-    int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);
+
+    Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
+    Torus *current_lwe_output_indexes =
+        GET_VARIANT_ELEMENT(lwe_output_indexes, i);
+    Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
+    Torus *current_lwe_input_indexes =
+        GET_VARIANT_ELEMENT(lwe_input_indexes, i);

    // Compute Keyswitch
    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
-        streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
-        lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
-        lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
-        gpu_offset);
+        streams[i], gpu_indexes[i], current_lwe_array_out,
+        current_lwe_output_indexes, current_lwe_array_in,
+        current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
+        base_log, level_count, num_samples_on_gpu);
  }
-
-  if (sync_streams)
-    for (uint i = 0; i < active_gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -39,36 +39,19 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
 }

 template <typename T>
-__device__ __forceinline__ void rescale_torus_element(T element, T &output,
-                                                      uint32_t log_shift) {
-  output =
-      round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
-            (double)log_shift);
+__device__ __forceinline__ void modulus_switch(T input, T &output,
+                                               uint32_t log_modulus) {
+  constexpr uint32_t BITS = sizeof(T) * 8;
+
+  output = input + (((T)1) << (BITS - log_modulus - 1));
+  output >>= (BITS - log_modulus);
 }

 template <typename T>
-__device__ __forceinline__ T rescale_torus_element(T element,
-                                                   uint32_t log_shift) {
-  return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
-               (double)log_shift);
+__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
+  T output;
+  modulus_switch(input, output, log_modulus);
+  return output;
 }

-template <>
-__device__ __forceinline__ void
-rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
-                                uint32_t log_shift) {
-  output =
-      round(__uint2double_rn(element) /
-            (__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
-            __uint2double_rn(log_shift));
-}
-
-template <>
-__device__ __forceinline__ void
-rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
-                                uint32_t log_shift) {
-  output = round(__ull2double_rn(element) /
-                 (__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
-                 __uint2double_rn(log_shift));
-}
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -166,19 +166,21 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                          Torus *d_array, Torus value, Torus n) {
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
-  if (attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda set value.")
-  }
-  check_cuda_error(cudaSetDevice(gpu_index));
-  int block_size = 256;
-  int num_blocks = (n + block_size - 1) / block_size;
+  if (n > 0) {
+    cudaPointerAttributes attr;
+    check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
+    if (attr.type != cudaMemoryTypeDevice) {
+      PANIC("Cuda error: invalid dest device pointer in cuda set value.")
+    }
+    check_cuda_error(cudaSetDevice(gpu_index));
+    int block_size = 256;
+    int num_blocks = (n + block_size - 1) / block_size;

-  // Launch the kernel
-  cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
-                                                               n);
-  check_cuda_error(cudaGetLastError());
+    // Launch the kernel
+    cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
+                                                                 n);
+    check_cuda_error(cudaGetLastError());
+  }
 }

 /// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
@@ -241,22 +243,18 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

 /// Get the maximum size for the shared memory
 int cuda_get_max_shared_memory(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                         gpu_index);
  check_cuda_error(cudaGetLastError());
+#if CUDA_ARCH == 900
+  max_shared_memory = 226000;
+#elif CUDA_ARCH == 890
+  max_shared_memory = 127000;
+#elif CUDA_ARCH == 800
+  max_shared_memory = 163000;
+#elif CUDA_ARCH == 700
+  max_shared_memory = 95000;
+#endif
  return max_shared_memory;
 }
-
-void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
-                              cudaStreamCallback_t callback, void *user_data) {
-
-  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
-}
-
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer) {
-  free(host_pointer);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
@@ -0,0 +1,49 @@
+#include "integer/addition.cuh"
+
+// C entry point (64-bit torus) that creates the scratch memory used by the
+// signed overflowing addition/subtraction.
+//
+// `signed_operation == 1` selects ADDITION; any other value selects
+// SUBTRACTION. The cryptographic parameters are bundled into an
+// int_radix_params and forwarded, together with `allocate_gpu_memory`, to the
+// templated scratch function, which stores the new memory object in
+// `*mem_ptr`.
+void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {
+
+  SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
+                                                : SIGNED_OPERATION::SUBTRACTION;
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_signed_overflowing_add_or_sub_memory<uint64_t> **)mem_ptr,
+      num_blocks, op, params, allocate_gpu_memory);
+}
+
+// C entry point (64-bit torus) running the signed overflowing
+// addition/subtraction on `lhs` and `rhs`.
+//
+// `mem_ptr` must be the memory object produced by the matching scratch
+// function. `signed_operation == 1` selects ADDITION; any other value selects
+// SUBTRACTION. The type-erased pointers are cast to uint64_t and forwarded to
+// host_integer_signed_overflowing_add_or_sub_kb, which writes the sum or
+// difference back into `lhs` and passes `overflowed` on as the output of its
+// overflow-resolution step.
+void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
+    void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_blocks) {
+
+  auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
+  SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
+                                                : SIGNED_OPERATION::SUBTRACTION;
+
+  host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
+      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
+      num_blocks);
+}
+
+// C entry point releasing the resources held by the memory object created by
+// the signed overflowing add/sub scratch function.
+//
+// NOTE(review): only `release` is called; the object itself (allocated with
+// `new` by the scratch function) is not deleted here and `*mem_ptr_void` is
+// left unchanged -- confirm who owns the host-side object.
+void cleanup_signed_overflowing_add_or_sub(void **streams,
+                                           uint32_t *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void) {
+  int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
+      (int_signed_overflowing_add_or_sub_memory<uint64_t> *)(*mem_ptr_void);
+
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -0,0 +1,137 @@
+#ifndef TFHE_RS_ADDITION_CUH
+#define TFHE_RS_ADDITION_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.h"
+#include "integer/comparison.cuh"
+#include "integer/integer.cuh"
+#include "integer/negation.cuh"
+#include "integer/scalar_shifts.cuh"
+#include "linear_algebra.h"
+#include "programmable_bootstrap.h"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Computes the overflow flag block of a signed add/sub from the last block's
+// inner propagation, input carry and output carry.
+//
+// The three inputs are combined into `last_block_inner_propagation` (which is
+// therefore overwritten) and `mem->resolve_overflow_lut` is then applied to
+// that single block, writing to `result`. The linear steps are enqueued on
+// streams[0] / gpu_indexes[0].
+template <typename Torus>
+void host_resolve_signed_overflow(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *result, Torus *last_block_inner_propagation,
+    Torus *last_block_input_carry, Torus *last_block_output_carry,
+    int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {
+
+  auto x = mem->x;
+
+  // Temporary single-element device buffer holding the cleartext scalar 2
+  Torus *d_clears =
+      (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
+
+  cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], d_clears, 2, 1);
+
+  // Presumably x = 2 * last_block_output_carry (one block) -- TODO confirm
+  // the argument order of this entry point.
+  // TODO: replace with the templated host function call.
+  // NOTE(review): the `_64` entry point is used inside a template on Torus,
+  // which looks like it ties this function to 64-bit data -- confirm.
+  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
+      streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
+      mem->params.big_lwe_dimension, 1);
+
+  // Accumulate x, then the input carry, into last_block_inner_propagation
+  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                last_block_inner_propagation, x, mem->params.big_lwe_dimension,
+                1);
+  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                last_block_inner_propagation, last_block_input_carry,
+                mem->params.big_lwe_dimension, 1);
+
+  // Apply the overflow-resolution LUT to the single combined block
+  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
+                                      last_block_inner_propagation,
+                                      mem->resolve_overflow_lut, ksks, bsks, 1);
+
+  // Release the temporary scalar buffer on the stream it was allocated on
+  cuda_drop_async(d_clears, streams[0], gpu_indexes[0]);
+}
+
+// Allocates the memory object used by
+// host_integer_signed_overflowing_add_or_sub_kb and stores it in `*mem_ptr`.
+// All arguments are forwarded to the
+// int_signed_overflowing_add_or_sub_memory constructor; the object is created
+// with `new`.
+template <typename Torus>
+__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
+    uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
+    bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_signed_overflowing_add_or_sub_memory<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_blocks, op,
+      allocate_gpu_memory);
+}
+
+/*
+ * Signed addition or subtraction with overflow detection.
+ *
+ * `op` selects the operation: SIGNED_OPERATION::ADDITION computes lhs + rhs,
+ * any other value computes lhs + (-rhs).
+ *
+ * - lhs: first operand, `num_blocks` blocks; overwritten with the result.
+ * - rhs: second operand, `num_blocks` blocks.
+ * - overflowed: output passed to host_resolve_signed_overflow.
+ * - mem_ptr: scratch memory created by
+ *   scratch_cuda_integer_signed_overflowing_add_or_sub_kb.
+ *
+ * Requires message_modulus >= 4 and carry_modulus >= 4 (asserted).
+ *
+ * The key and memory types follow the `Torus` template parameter, like the
+ * sibling functions of this file; the existing 64-bit instantiation is
+ * unchanged.
+ */
+template <typename Torus>
+__host__ void host_integer_signed_overflowing_add_or_sub_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
+    Torus **ksks, int_signed_overflowing_add_or_sub_memory<Torus> *mem_ptr,
+    uint32_t num_blocks) {
+
+  auto radix_params = mem_ptr->params;
+
+  uint32_t big_lwe_dimension = radix_params.big_lwe_dimension;
+  uint32_t big_lwe_size = big_lwe_dimension + 1;
+  uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+
+  assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4);
+
+  auto result = mem_ptr->result;
+  auto neg_rhs = mem_ptr->neg_rhs;
+  auto input_carries = mem_ptr->input_carries;
+  auto output_carry = mem_ptr->output_carry;
+  auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation;
+
+  // NOTE(review): `result` is written again by host_addition right below, so
+  // this copy looks redundant -- confirm before removing.
+  cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes,
+                               streams[0], gpu_indexes[0]);
+
+  // phase 1: block-wise sum without carry propagation, on streams[0]
+  if (op == SIGNED_OPERATION::ADDITION) {
+    host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
+                  big_lwe_dimension, num_blocks);
+  } else {
+    host_integer_radix_negation(
+        streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
+        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
+    host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
+                  big_lwe_dimension, num_blocks);
+  }
+
+  // phase 2: wait for phase 1, then run carry propagation and the last-block
+  // inner propagation on two separate sets of sub-streams
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }
+
+  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
+                              result, output_carry, input_carries,
+                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation(
+      mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
+      last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
+      &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
+      ksks);
+
+  // Both branches must be finished before their outputs are combined
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
+    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+  }
+
+  // phase 3: resolve the overflow flag from the last block's carries
+  auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
+
+  host_resolve_signed_overflow(
+      streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
+      input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
+
+  // The operation is in place: publish the result through lhs
+  cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes,
+                               streams[0], gpu_indexes[0]);
+}
+
+#endif // TFHE_RS_ADDITION_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_CMUX_CUH

 #include "integer.cuh"
-#include <omp.h>

 template <typename Torus>
 __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -57,27 +56,18 @@ __host__ void host_integer_radix_cmux_kb(
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      auto mem_true = mem_ptr->zero_if_true_buffer;
-      zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-                  lwe_array_true, lwe_condition, mem_true,
-                  mem_ptr->inverted_predicate_lut, bsks, ksks,
-                  num_radix_blocks);
-    }
-#pragma omp section
-    {
-      auto mem_false = mem_ptr->zero_if_false_buffer;
-      zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
-                  lwe_array_false, lwe_condition, mem_false,
-                  mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
-    }
-  }
-  for (uint j = 0; j < gpu_count; j++) {
+  auto mem_true = mem_ptr->zero_if_true_buffer;
+  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+              lwe_array_true, lwe_condition, mem_true,
+              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
+  auto mem_false = mem_ptr->zero_if_false_buffer;
+  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
+              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
+              bsks, ksks, num_radix_blocks);
+  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+  }
+  for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
    cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
  }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -245,7 +245,6 @@ __host__ void host_compare_with_zero_equality(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -26,54 +26,11 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(

  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

-  switch (mem->params.polynomial_size) {
-  case 512:
-    host_integer_div_rem_kb<uint64_t, Degree<512>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsks, (uint64_t **)(ksks), mem, num_blocks);
-    break;
-  case 1024:
-
-    host_integer_div_rem_kb<uint64_t, Degree<1024>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsks, (uint64_t **)(ksks), mem, num_blocks);
-    break;
-  case 2048:
-    host_integer_div_rem_kb<uint64_t, Degree<2048>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsks, (uint64_t **)(ksks), mem, num_blocks);
-    break;
-  case 4096:
-    host_integer_div_rem_kb<uint64_t, Degree<4096>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsks, (uint64_t **)(ksks), mem, num_blocks);
-    break;
-  case 8192:
-    host_integer_div_rem_kb<uint64_t, Degree<8192>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsks, (uint64_t **)(ksks), mem, num_blocks);
-    break;
-  case 16384:
-    host_integer_div_rem_kb<uint64_t, Degree<16384>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
-        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsks, (uint64_t **)(ksks), mem, num_blocks);
-    break;
-  default:
-    PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
-  }
+  host_integer_div_rem_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+      static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+      bsks, (uint64_t **)(ksks), mem, num_blocks);
 }

 void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -14,7 +14,6 @@
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
 #include <iostream>
-#include <omp.h>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -31,17 +30,13 @@ template <typename Torus> struct lwe_ciphertext_list {
  int_radix_params params;

  size_t big_lwe_size;
-  size_t radix_size;
  size_t big_lwe_size_bytes;
-  size_t radix_size_bytes;
  size_t big_lwe_dimension;

  lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
      : data(src), params(params), max_blocks(max_blocks) {
    big_lwe_size = params.big_lwe_dimension + 1;
    big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-    radix_size = max_blocks * big_lwe_size;
-    radix_size_bytes = radix_size * sizeof(Torus);
    big_lwe_dimension = params.big_lwe_dimension;
    len = max_blocks;
  }
@@ -173,7 +168,7 @@ __host__ void scratch_cuda_integer_div_rem_kb(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }

-template <typename Torus, class params>
+template <typename Torus>
 __host__ void
 host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
@@ -376,35 +371,19 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
-#pragma omp parallel sections
-    {
-#pragma omp section
-      {
-        // interesting_divisor
-        trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
-                                           gpu_count);
-      }
-#pragma omp section
-      {
-        // divisor_ms_blocks
-        trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
-                                   gpu_count);
-      }
-#pragma omp section
-      {
-        // interesting_remainder1
-        // numerator_block_stack
-        left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
-                                          gpu_count);
-      }
-#pragma omp section
-      {
-        // interesting_remainder2
-        left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
-                                          gpu_count);
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
+    // interesting_divisor
+    trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
+                                       gpu_count);
+    // divisor_ms_blocks
+    trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
+    // interesting_remainder1
+    // numerator_block_stack
+    left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
+                                      gpu_count);
+    // interesting_remainder2
+    left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
+                                      gpu_count);
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -439,7 +418,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    //  `subtraction_overflowed` - single ciphertext
    auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count) {
-      host_integer_overflowing_sub_kb<Torus, params>(
+      host_integer_overflowing_sub_kb<Torus>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
          subtraction_overflowed.data, merged_interesting_remainder.data,
          interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
@@ -493,28 +472,15 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
-#pragma omp parallel sections
-    {
-#pragma omp section
-      {
-        // new_remainder
-        // subtraction_overflowed
-        do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
-      }
-#pragma omp section
-      {
-        // at_least_one_upper_block_is_non_zero
-        check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
-                                   gpu_count);
-      }
-#pragma omp section
-      {
-        // cleaned_merged_interesting_remainder
-        create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
-                                                 gpu_indexes, gpu_count);
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
+    // new_remainder
+    // subtraction_overflowed
+    do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
+    // at_least_one_upper_block_is_non_zero
+    check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
+    // cleaned_merged_interesting_remainder
+    create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
+                                             gpu_indexes, gpu_count);
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -571,27 +537,15 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
-#pragma omp parallel sections
-    {
-#pragma omp section
-      {
-        // cleaned_merged_interesting_remainder
-        conditionally_zero_out_merged_interesting_remainder(
-            mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
-      }
-#pragma omp section
-      {
-        // new_remainder
-        conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
-                                                    gpu_indexes, gpu_count);
-      }
-#pragma omp section
-      {
-        // quotient
-        set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
+    // cleaned_merged_interesting_remainder
+    conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
+                                                        gpu_indexes, gpu_count);
+    // new_remainder
+    conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
+                                                gpu_indexes, gpu_count);
+    // quotient
+    set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -617,22 +571,13 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
-#pragma omp parallel sections
-  {
-#pragma omp section
-    {
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
-          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-    }
-#pragma omp section
-    {
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
-          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
-    }
-  }
-  for (uint j = 0; j < gpu_count; j++) {
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
+      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
+      ksks, num_blocks, mem_ptr->message_extract_lut_2);
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -19,9 +19,8 @@ void scratch_cuda_full_propagation_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -29,8 +28,7 @@ void scratch_cuda_full_propagation_64(

  scratch_cuda_full_propagation<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count,
-      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
-      allocate_gpu_memory);
+      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
 }

 void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
@@ -175,3 +173,55 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+  // Pack the scalar parameters, then defer to the bivariate-LUT scratch call.
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
+      num_radix_blocks, params, allocate_gpu_memory);
+}
+
+void cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t shift) {
+  // Reuse the parameters stored in the LUT object that `mem_ptr` points to.
+  int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;
+  // NOTE(review): `shift` is never read in this wrapper - confirm intent.
+  host_compute_prefix_sum_hillis_steele<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe), params,
+      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks);
+}
+
+void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) { // handle to an int_radix_lut<uint64_t>
+  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+} // only release() is invoked; the pointed-to object is not deleted here
+
+void cuda_integer_reverse_blocks_64_inplace(void **streams,
+                                            uint32_t *gpu_indexes,
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size) {
+  // gpu_count is unused here; the host helper only takes streams and indexes.
+  host_radix_blocks_reverse_inplace<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes,
+      static_cast<uint64_t *>(lwe_array), num_blocks, lwe_size);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -3,6 +3,7 @@

 #include "crypto/keyswitch.cuh"
 #include "device.h"
+#include "helper_multi_gpu.h"
 #include "integer.h"
 #include "integer/scalar_addition.cuh"
 #include "linear_algebra.h"
@@ -10,6 +11,7 @@
 #include "polynomial/functions.cuh"
 #include "programmable_bootstrap.h"
 #include "utils/helper.cuh"
+#include "utils/helper_multi_gpu.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <functional>

@@ -20,18 +22,19 @@ template <typename Torus>
 __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                          uint32_t value, uint32_t blocks_count,
                                          uint32_t lwe_size) {
-  value %= blocks_count;
-
  size_t tid = threadIdx.x;
-  size_t src_block_id = blockIdx.x;
-  size_t dst_block_id = (src_block_id + value) % blocks_count;
-  size_t stride = blockDim.x;
+  if (tid < lwe_size) {
+    value %= blocks_count;
+    size_t src_block_id = blockIdx.x;
+    size_t dst_block_id = (src_block_id + value) % blocks_count;
+    size_t stride = blockDim.x;

-  auto cur_src_block = &src[src_block_id * lwe_size];
-  auto cur_dst_block = &dst[dst_block_id * lwe_size];
+    auto cur_src_block = &src[src_block_id * lwe_size];
+    auto cur_dst_block = &dst[dst_block_id * lwe_size];

-  for (size_t i = tid; i < lwe_size; i += stride) {
-    cur_dst_block[i] = cur_src_block[i];
+    for (size_t i = tid; i < lwe_size; i += stride) {
+      cur_dst_block[i] = cur_src_block[i];
+    }
  }
 }

@@ -42,25 +45,28 @@ template <typename Torus>
 __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
                                         uint32_t blocks_count,
                                         uint32_t lwe_size) {
-  value %= blocks_count;
-  size_t src_block_id = blockIdx.x;
-
  size_t tid = threadIdx.x;
-  size_t dst_block_id = (src_block_id >= value)
-                            ? src_block_id - value
-                            : src_block_id - value + blocks_count;
-  size_t stride = blockDim.x;
+  if (tid < lwe_size) {
+    value %= blocks_count;
+    size_t src_block_id = blockIdx.x;

-  auto cur_src_block = &src[src_block_id * lwe_size];
-  auto cur_dst_block = &dst[dst_block_id * lwe_size];
+    size_t dst_block_id = (src_block_id >= value)
+                              ? src_block_id - value
+                              : src_block_id - value + blocks_count;
+    size_t stride = blockDim.x;

-  for (size_t i = tid; i < lwe_size; i += stride) {
-    cur_dst_block[i] = cur_src_block[i];
+    auto cur_src_block = &src[src_block_id * lwe_size];
+    auto cur_dst_block = &dst[dst_block_id * lwe_size];
+
+    for (size_t i = tid; i < lwe_size; i += stride) {
+      cur_dst_block[i] = cur_src_block[i];
+    }
  }
 }

 // rotate radix ciphertext right with specific value
 // calculation is not inplace, so `dst` and `src` must not be the same
+// one CUDA block is responsible for processing a single LWE ciphertext
 template <typename Torus>
 __host__ void
 host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -93,6 +99,35 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
      dst, src, value, blocks_count, lwe_size);
 }

+// reverse the order of the blocks in a list
+// each CUDA block swaps one pair of blocks: entry i and entry (count - 1 - i)
+template <typename Torus>
+__global__ void radix_blocks_reverse_lwe_inplace(Torus *src,
+                                                 uint32_t blocks_count,
+                                                 uint32_t lwe_size) {
+  // CUDA block `idx` exchanges list entry `idx` with its mirror `rev_idx`.
+  size_t idx = blockIdx.x;
+  size_t rev_idx = blocks_count - 1 - idx;
+  // Threads stride over the `lwe_size` elements of the pair, swapping each.
+  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
+    Torus back_element = src[rev_idx * lwe_size + j];
+    Torus front_element = src[idx * lwe_size + j];
+    src[idx * lwe_size + j] = back_element;
+    src[rev_idx * lwe_size + j] = front_element;
+  }
+}
+
+// Host launcher for the in-place block reversal of `src`, issued on streams[0].
+template <typename Torus>
+__host__ void host_radix_blocks_reverse_inplace(
+    cudaStream_t *streams, uint32_t *gpu_indexes, Torus *src,
+    uint32_t blocks_count, uint32_t lwe_size) {
+  if (blocks_count < 2) return; // nothing to swap; a 0-block grid is invalid
+  cudaSetDevice(gpu_indexes[0]);
+  radix_blocks_reverse_lwe_inplace<<<blocks_count / 2, 1024, 0, streams[0]>>>(
+      src, blocks_count, lwe_size); // one CUDA block per swapped pair
+}
+
 // polynomial_size threads
 template <typename Torus>
 __global__ void
@@ -153,28 +188,67 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-  /// Apply KS to go from a big LWE dimension to a small LWE dimension
-  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
-                           lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
-                           lwe_array_in, lut->lwe_indexes_in, ksks,
-                           big_lwe_dimension, small_lwe_dimension, ks_base_log,
-                           ks_level, num_radix_blocks, false);
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

-  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-  /// dimension to a big LWE dimension
-  execute_pbs<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
-      lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
-      lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
-      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-      grouping_factor, num_radix_blocks, 1, 0,
-      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
-
-  /// Synchronize all GPUs
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+  if (active_gpu_count == 1) {
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+                                   lwe_trivial_indexes_vec[0], lwe_array_in,
+                                   lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                                   small_lwe_dimension, ks_base_log, ks_level,
+                                   num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
+        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, num_radix_blocks, pbs_type);
+  } else {
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                                   lwe_array_in_vec, lwe_trivial_indexes_vec,
+                                   ksks, big_lwe_dimension, small_lwe_dimension,
+                                   ks_base_log, ks_level, num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes,
+                                      num_radix_blocks, big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
  }
 }

@@ -205,29 +279,63 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
                        num_radix_blocks);
  check_cuda_error(cudaGetLastError());

-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

-  /// Apply KS to go from a big LWE dimension to a small LWE dimension
-  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
-                           lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
-                           lwe_array_pbs_in, lut->lwe_indexes_in, ksks,
-                           big_lwe_dimension, small_lwe_dimension, ks_base_log,
-                           ks_level, num_radix_blocks, false);
-
-  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-  /// dimension to a big LWE dimension
-  execute_pbs<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
-      lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
-      lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
-      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-      grouping_factor, num_radix_blocks, 1, 0,
-      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
-
-  /// Synchronize all GPUs
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-  for (uint i = 0; i < active_gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+  if (active_gpu_count == 1) {
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+                                   lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
+                                   lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                                   small_lwe_dimension, ks_base_log, ks_level,
+                                   num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
+        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, num_radix_blocks, pbs_type);
+  } else {
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
+        lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+        num_radix_blocks, big_lwe_dimension + 1);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                                   lwe_array_in_vec, lwe_trivial_indexes_vec,
+                                   ksks, big_lwe_dimension, small_lwe_dimension,
+                                   ks_base_log, ks_level, num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes,
+                                      num_radix_blocks, big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
  }
 }

@@ -325,7 +433,6 @@ void generate_device_accumulator_bivariate(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {

-  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -335,14 +442,14 @@ void generate_device_accumulator_bivariate(
                                         message_modulus, carry_modulus, f);

  // copy host lut and lut_indexes_vec to device
+  cuda_synchronize_stream(stream, gpu_index);
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
                           stream, gpu_index);

-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }

 /*
@@ -358,7 +465,6 @@ void generate_device_accumulator_bivariate_with_factor(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {

-  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -368,15 +474,15 @@ void generate_device_accumulator_bivariate_with_factor(
      h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
      factor);

+  cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
                           stream, gpu_index);

-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }

 /*
@@ -394,7 +500,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

-  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -403,14 +508,14 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

+  cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index);

-  // Release memory when possible
-  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                           h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }

 template <typename Torus>
@@ -424,6 +529,43 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
                                    num_radix_blocks, allocate_gpu_memory);
 }

+/*
+ * Hillis & Steele style scan over `num_blocks` radix blocks.
+ *
+ * `generates_or_propagates` is first copied into `step_output`. Then, with
+ * `space` doubling at every step (1, 2, 4, ...), the bivariate lookup table
+ * `luts` is applied to the blocks of `step_output` starting at offset `space`
+ * paired with the blocks of `generates_or_propagates` starting at offset 0,
+ * and the refreshed blocks are copied back to the same offset of
+ * `generates_or_propagates`.
+ *
+ * Both arrays must hold at least `num_blocks` big LWE ciphertexts of
+ * (glwe_dimension * polynomial_size + 1) elements each. The copies are
+ * enqueued on streams[0] / gpu_indexes[0]; this function itself does not
+ * synchronize the stream.
+ *
+ * When `num_blocks` is 0 the function returns without enqueuing any work.
+ */
+template <typename Torus>
+void host_compute_prefix_sum_hillis_steele(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
+    int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
+    uint32_t num_blocks) {
+
+  // Nothing to scan; this also keeps log2() away from 0, whose result (-inf)
+  // cannot be converted to an integer
+  if (num_blocks == 0)
+    return;
+
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+
+  int num_steps = ceil(log2((double)num_blocks));
+  // Unsigned like num_blocks, so that the comparison and subtraction below do
+  // not mix signed and unsigned operands
+  uint32_t space = 1;
+  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
+                               big_lwe_size_bytes * num_blocks, streams[0],
+                               gpu_indexes[0]);
+
+  for (int step = 0; step < num_steps; step++) {
+    // The offset must stay strictly inside the num_blocks blocks of the buffers
+    if (space >= num_blocks)
+      PANIC("Cuda error: step output is going out of bounds in Hillis Steele "
+            "propagation")
+    auto cur_blocks = &step_output[space * big_lwe_size];
+    auto prev_blocks = generates_or_propagates;
+    uint32_t cur_total_blocks = num_blocks - space;
+
+    // In place on step_output: block (space + i) is combined with block i of
+    // generates_or_propagates
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
+        bsks, ksks, cur_total_blocks, luts, luts->params.message_modulus);
+
+    // Mirror the refreshed blocks so the next step reads up-to-date inputs
+    cuda_memcpy_async_gpu_to_gpu(
+        &generates_or_propagates[space * big_lwe_size], cur_blocks,
+        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
+    space *= 2;
+  }
+}
+
 template <typename Torus>
 void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *lwe_array,
@@ -448,29 +590,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
-
-  int num_steps = ceil(log2((double)num_blocks));
-  int space = 1;
-  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
-                               big_lwe_size_bytes * num_blocks, streams[0],
-                               gpu_indexes[0]);
-
-  for (int step = 0; step < num_steps; step++) {
-    auto cur_blocks = &step_output[space * big_lwe_size];
-    auto prev_blocks = generates_or_propagates;
-    int cur_total_blocks = num_blocks - space;
-
-    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
-        bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
-        luts_carry_propagation_sum->params.message_modulus);
-
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    cuda_memcpy_async_gpu_to_gpu(
-        &generates_or_propagates[space * big_lwe_size], cur_blocks,
-        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
-    space *= 2;
-  }
+  host_compute_prefix_sum_hillis_steele(
+      streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
+      params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
                                 generates_or_propagates, 1, num_blocks,
@@ -496,11 +618,24 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
      num_blocks, message_acc);
 }

+/*
+ * Applies the bivariate lookup table stored in
+ * `mem->last_block_inner_propagation_lut` to the block pair (`lhs`, `rhs`)
+ * and writes the resulting block to `last_block_inner_propagation`.
+ * Exactly one radix block is processed.
+ */
+template <typename Torus>
+void host_generate_last_block_inner_propagation(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs,
+    int_last_block_inner_propagate_memory<Torus> *mem, void **bsks,
+    Torus **ksks) {
+
+  // Lookup table and modulus both come from the scratch memory object
+  auto inner_propagation_lut = mem->last_block_inner_propagation_lut;
+  auto message_modulus = mem->params.message_modulus;
+
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs,
+      bsks, ksks, 1, inner_propagation_lut, message_modulus);
+}
+
 template <typename Torus>
 void host_propagate_single_sub_borrow(cudaStream_t *streams,
                                      uint32_t *gpu_indexes, uint32_t gpu_count,
                                      Torus *overflowed, Torus *lwe_array,
-                                      int_single_borrow_prop_memory<Torus> *mem,
+                                      int_overflowing_sub_memory<Torus> *mem,
                                      void **bsks, Torus **ksks,
                                      uint32_t num_blocks) {
  auto params = mem->params;
@@ -521,27 +656,9 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
-  int num_steps = ceil(log2((double)num_blocks));
-  int space = 1;
-  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
-                               big_lwe_size_bytes * num_blocks, streams[0],
-                               gpu_indexes[0]);
-
-  for (int step = 0; step < num_steps; step++) {
-    auto cur_blocks = &step_output[space * big_lwe_size];
-    auto prev_blocks = generates_or_propagates;
-    int cur_total_blocks = num_blocks - space;
-
-    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
-        bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
-        luts_carry_propagation_sum->params.message_modulus);
-
-    cuda_memcpy_async_gpu_to_gpu(
-        &generates_or_propagates[space * big_lwe_size], cur_blocks,
-        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
-    space *= 2;
-  }
+  host_compute_prefix_sum_hillis_steele<Torus>(
+      streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
+      params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

  cuda_memcpy_async_gpu_to_gpu(
      overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
@@ -583,12 +700,11 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

-    cudaSetDevice(gpu_indexes[0]);
    /// Since the keyswitch is done on one input only, use only 1 GPU
-    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
-        streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
+    execute_keyswitch_async<Torus>(
+        streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
        mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
-        mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension,
+        mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
@@ -596,15 +712,14 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 small_lwe_size * sizeof(Torus), streams[0],
                                 gpu_indexes[0]);

-    execute_pbs<Torus>(
+    execute_pbs_async<Torus>(
        streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
        mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
        mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
        params.glwe_dimension, params.small_lwe_dimension,
        params.polynomial_size, params.pbs_base_log, params.pbs_level,
-        params.grouping_factor, 2, 2, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);
+        params.grouping_factor, 2, params.pbs_type);

    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), streams[0],
@@ -625,12 +740,10 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
                                   uint32_t gpu_count,
                                   int_fullprop_buffer<Torus> **mem_ptr,
                                   int_radix_params params,
-                                   uint32_t num_radix_blocks,
                                   bool allocate_gpu_memory) {

-  *mem_ptr =
-      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
-                                     num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr = new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count,
+                                            params, allocate_gpu_memory);
 }

 // (lwe_dimension+1) threads
@@ -675,8 +788,9 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
                          Torus *lwe_array_out, Torus *lwe_array_in,
                          uint32_t lwe_dimension, uint32_t num_radix_blocks,
                          uint32_t factor) {
+  if (num_radix_blocks == 0)
+    return;
  cudaSetDevice(gpu_index);
-
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -71,7 +71,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
-    uint32_t max_shared_memory, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size * glwe_dimension, lwe_dimension,
@@ -123,7 +123,6 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 * - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
 * ciphertext
 * - 'pbs_type' selects which PBS implementation should be used
- * - 'max_shared_memory' maximum shared memory per cuda block
 */
 void cuda_integer_mult_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -133,7 +132,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(

  switch (polynomial_size) {
  case 256:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -141,7 +140,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 512:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -149,7 +148,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 1024:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -157,7 +156,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 2048:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -165,7 +164,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 4096:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -173,7 +172,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 8192:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -181,7 +180,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 16384:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
+    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
@@ -203,7 +202,7 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
+void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -216,13 +215,13 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);
-  scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
+  scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
      max_num_radix_in_vec, params, allocate_gpu_memory);
 }

-void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
+void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
@@ -238,42 +237,47 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(

  switch (mem->params.polynomial_size) {
  case 512:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 1024:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 2048:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 4096:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 8192:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 16384:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
@@ -287,10 +291,9 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
  free(terms_degree);
 }

-void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
-                                                    uint32_t *gpu_indexes,
-                                                    uint32_t gpu_count,
-                                                    int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
  int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
      (int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -8,11 +8,13 @@

 #include "crypto/keyswitch.cuh"
 #include "device.h"
+#include "helper_multi_gpu.h"
 #include "integer.h"
 #include "integer/integer.cuh"
 #include "linear_algebra.h"
 #include "programmable_bootstrap.h"
 #include "utils/helper.cuh"
+#include "utils/helper_multi_gpu.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
 #include <iostream>
@@ -91,15 +93,11 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
  }
 }

-template <typename Torus, sharedMemDegree SMD>
+template <typename Torus>
 __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
                                uint32_t chunk_size, uint32_t block_size,
                                uint32_t num_blocks) {

-  extern __shared__ int8_t sharedmem[];
-
-  Torus *result = (Torus *)sharedmem;
-
  size_t stride = blockDim.x;
  size_t chunk_id = blockIdx.x;
  size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -107,10 +105,7 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
  size_t block_stride = blockIdx.y * block_size;
-  auto dst_block = &dst_radix[block_stride];
-
-  if constexpr (SMD == NOSM)
-    result = dst_block;
+  auto result = &dst_radix[block_stride];

  // init shared mem with first radix of chunk
  size_t tid = threadIdx.x;
@@ -125,18 +120,12 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
      result[i] += cur_src_radix[block_stride + i];
    }
  }
-
-  // put result from shared mem to global mem
-  if constexpr (SMD == FULLSM)
-    for (int i = tid; i < block_size; i += stride)
-      dst_block[i] = result[i];
 }

 template <typename Torus, class params>
 __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
                                        Torus *msb_blocks,
                                        uint32_t glwe_dimension,
-                                        uint32_t lsb_count, uint32_t msb_count,
                                        uint32_t num_blocks) {
  size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
  size_t big_lwe_id = blockIdx.x;
@@ -180,38 +169,24 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
  }
 }
 template <typename Torus>
-__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
+__host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    int_radix_params params, bool allocate_gpu_memory) {

-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
-    check_cuda_error(cudaFuncSetAttribute(
-        tree_add_chunks<Torus, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
-                           cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(
-        cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
-    check_cuda_error(cudaGetLastError());
-  }
  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
      max_num_radix_in_vec, allocate_gpu_memory);
 }

 template <typename Torus, class params>
-__host__ void host_integer_sum_ciphertexts_vec_kb(
+__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
    uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
-    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {
+    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
+    int_radix_lut<Torus> *reused_lut = nullptr) {

  auto new_blocks = mem_ptr->new_blocks;
  auto old_blocks = mem_ptr->old_blocks;
@@ -223,11 +198,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
  auto message_modulus = mem_ptr->params.message_modulus;
  auto carry_modulus = mem_ptr->params.carry_modulus;
  auto num_blocks = num_blocks_in_radix;
-  auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
+  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
-  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
+  auto small_lwe_size = small_lwe_dimension + 1;

  if (old_blocks != terms) {
    cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
@@ -246,7 +222,48 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
  int32_t h_smart_copy_in[r * num_blocks];
  int32_t h_smart_copy_out[r * num_blocks];

-  auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
+  /// Here it is important to query the default max shared memory on device 0
+  /// instead of cuda_get_max_shared_memory,
+  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
+  int max_shared_memory = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
+
+  // create lut object for message and carry
+  // we allocate luts_message_carry in the host function (instead of scratch)
+  // to reduce average memory consumption
+  int_radix_lut<Torus> *luts_message_carry;
+  size_t ch_amount = r / chunk_size;
+  if (!ch_amount)
+    ch_amount++;
+  if (reused_lut == nullptr) {
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
+        2 * ch_amount * num_blocks, true);
+  } else {
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
+        2 * ch_amount * num_blocks, reused_lut);
+  }
+  auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
+  auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
+
+  // define functions for each accumulator
+  auto lut_f_message = [message_modulus](Torus x) -> Torus {
+    return x % message_modulus;
+  };
+  auto lut_f_carry = [message_modulus](Torus x) -> Torus {
+    return x / message_modulus;
+  };
+
+  // generate accumulators
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, lut_f_message);
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, lut_f_carry);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  while (r > 2) {
    size_t cur_total_blocks = r * num_blocks;
@@ -257,12 +274,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    size_t sm_size = big_lwe_size * sizeof(Torus);

    cudaSetDevice(gpu_indexes[0]);
-    if (sm_size < max_shared_memory)
-      tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
-          new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
-    else
-      tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
-          new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
+    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
+        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

    check_cuda_error(cudaGetLastError());

@@ -275,46 +288,21 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
        terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
        h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
        total_count, message_count, carry_count, sm_copy_count);
-
-    // create lut object for message and carry
-    // we allocate luts_message_carry in the host function (instead of scratch)
-    // to reduce average memory consumption
-    auto luts_message_carry = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, mem_ptr->params, 2, total_count, true);
-
-    auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
-    auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
-
-    // define functions for each accumulator
-    auto lut_f_message = [message_modulus](Torus x) -> Torus {
-      return x % message_modulus;
-    };
-    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
-      return x / message_modulus;
-    };
-
-    // generate accumulators
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], message_acc, glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, lut_f_message);
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, lut_f_carry);
-
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
+    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
+                                        h_lwe_idx_in, h_lwe_idx_out);

-    size_t copy_size = total_count * sizeof(Torus);
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
-                             streams[0], gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
-                             streams[0], gpu_indexes[0]);
-    copy_size = sm_copy_count * sizeof(int32_t);
+    size_t copy_size = sm_copy_count * sizeof(int32_t);
    cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
                             streams[0], gpu_indexes[0]);
    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
                             streams[0], gpu_indexes[0]);

+    // inside d_smart_copy_in there are only -1 values
+    // it's fine to call smart_copy with same pointer
+    // as source and destination
    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
        big_lwe_size);
@@ -328,28 +316,97 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(

    luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
-    /// Apply KS to go from a big LWE dimension to a small LWE dimension
-    /// After this keyswitch execution, we need to synchronize the streams
-    /// because the keyswitch and PBS do not operate on the same number of
-    /// inputs
-    execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
-                             lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
-                             polynomial_size * glwe_dimension, lwe_dimension,
-                             mem_ptr->params.ks_base_log,
-                             mem_ptr->params.ks_level, message_count, true);
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
+    std::vector<Torus *> small_lwe_vector_vec =
+        luts_message_carry->lwe_after_ks_vec;
+    std::vector<Torus *> lwe_after_pbs_vec =
+        luts_message_carry->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec =
+        luts_message_carry->lwe_trivial_indexes_vec;

-    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-    /// dimension to a big LWE dimension
-    execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
-                       lwe_indexes_out, luts_message_carry->lut_vec,
-                       luts_message_carry->lut_indexes_vec, small_lwe_vector,
-                       lwe_indexes_in, bsks, luts_message_carry->buffer,
-                       glwe_dimension, lwe_dimension, polynomial_size,
-                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-                       mem_ptr->params.grouping_factor, total_count, 2, 0,
-                       max_shared_memory, mem_ptr->params.pbs_type, true);
-    luts_message_carry->release(streams, gpu_indexes, gpu_count);
+    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
+    if (active_gpu_count == 1) {
+      /// Apply KS to go from a big LWE dimension to a small LWE dimension
+      /// After this keyswitch execution, we need to synchronize the streams
+      /// because the keyswitch and PBS do not operate on the same number of
+      /// inputs
+      execute_keyswitch_async<Torus>(
+          streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks,
+          lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
+          small_lwe_dimension, mem_ptr->params.ks_base_log,
+          mem_ptr->params.ks_level, message_count);
+
+      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+      /// dimension to a big LWE dimension
+      execute_pbs_async<Torus>(
+          streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
+          luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
+          small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
+          glwe_dimension, small_lwe_dimension, polynomial_size,
+          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
+          mem_ptr->params.grouping_factor, total_count,
+          mem_ptr->params.pbs_type);
+    } else {
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+      multi_gpu_scatter_lwe_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
+          luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, message_count,
+          big_lwe_size);
+
+      /// Apply KS to go from a big LWE dimension to a small LWE dimension
+      /// After this keyswitch execution, we need to synchronize the streams
+      /// because the keyswitch and PBS do not operate on the same number of
+      /// inputs
+      execute_keyswitch_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
+          ksks, big_lwe_dimension, small_lwe_dimension,
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
+
+      /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
+      /// different configuration
+      multi_gpu_gather_lwe_async<Torus>(
+          streams, gpu_indexes, gpu_count, small_lwe_vector,
+          small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, message_count,
+          small_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 0; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
+
+      multi_gpu_scatter_lwe_async<Torus>(
+          streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+          small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, total_count,
+          small_lwe_size);
+
+      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+      /// dimension to a big LWE dimension
+      execute_pbs_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+          lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
+          luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
+          glwe_dimension, small_lwe_dimension, polynomial_size,
+          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
+          mem_ptr->params.grouping_factor, total_count,
+          mem_ptr->params.pbs_type);
+
+      multi_gpu_gather_lwe_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
+          luts_message_carry->h_lwe_indexes_out,
+          luts_message_carry->using_trivial_lwe_indexes, total_count,
+          big_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 0; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
+    }

    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
    int new_blocks_created = 2 * ch_amount * num_blocks;
@@ -362,17 +419,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    std::swap(new_blocks, old_blocks);
    r = (new_blocks_created + rem_blocks) / num_blocks;
  }
+  luts_message_carry->release(streams, gpu_indexes, gpu_count);
+  delete (luts_message_carry);

  host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
                num_blocks);
-
-  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
-                                     radix_lwe_out, nullptr, nullptr,
-                                     mem_ptr->scp_mem, bsks, ksks, num_blocks);
 }

-template <typename Torus, typename STorus, class params>
+template <typename Torus, class params>
 __host__ void host_integer_mult_radix_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
@@ -464,8 +519,7 @@ __host__ void host_integer_mult_radix_kb(
  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
-                       glwe_dimension, lsb_vector_block_count,
-                       msb_vector_block_count, num_blocks);
+                       glwe_dimension, num_blocks);
  check_cuda_error(cudaGetLastError());

  int terms_degree[2 * num_blocks * num_blocks];
@@ -481,10 +535,15 @@ __host__ void host_integer_mult_radix_kb(
    terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
  }

-  host_integer_sum_ciphertexts_vec_kb<Torus, params>(
+  host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
      terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
-      2 * num_blocks);
+      2 * num_blocks, mem_ptr->luts_array);
+
+  auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
+  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
+                                     radix_lwe_out, nullptr, nullptr,
+                                     scp_mem_ptr, bsks, ksks, num_blocks);
 }

 template <typename Torus>
@@ -492,22 +551,6 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
-    check_cuda_error(cudaFuncSetAttribute(
-        tree_add_chunks<Torus, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
-                           cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(
-        cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
-    check_cuda_error(cudaGetLastError());
-  }
-
  *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
                                       num_radix_blocks, allocate_gpu_memory);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -38,65 +38,13 @@ void cuda_integer_radix_overflowing_sub_kb_64(

  auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;

-  switch (mem->params.polynomial_size) {
-  case 512:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-        mem, num_blocks);
-    break;
-  case 1024:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-        mem, num_blocks);
-    break;
-  case 2048:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-        mem, num_blocks);
-    break;
-  case 4096:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-        mem, num_blocks);
-    break;
-  case 8192:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-        mem, num_blocks);
-    break;
-  case 16384:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-        mem, num_blocks);
-    break;
-  default:
-    PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
-  }
+  host_integer_overflowing_sub_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(radix_lwe_out),
+      static_cast<uint64_t *>(radix_lwe_overflowed),
+      static_cast<uint64_t *>(radix_lwe_left),
+      static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks), mem,
+      num_blocks);
 }

 void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -98,7 +98,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }

-template <typename Torus, class params>
+template <typename Torus>
 __host__ void host_integer_overflowing_sub_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
@@ -113,9 +113,9 @@ __host__ void host_integer_overflowing_sub_kb(
      radix_params.message_modulus, radix_params.carry_modulus,
      radix_params.message_modulus - 1);

-  host_propagate_single_sub_borrow<Torus>(
-      streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
-      mem_ptr->borrow_prop_mem, bsks, ksks, num_blocks);
+  host_propagate_single_sub_borrow<Torus>(streams, gpu_indexes, gpu_count,
+                                          radix_lwe_overflowed, radix_lwe_out,
+                                          mem_ptr, bsks, ksks, num_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH

 #include "integer/comparison.cuh"
-#include <omp.h>

 template <typename Torus>
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
@@ -87,53 +86,43 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }

-#pragma omp parallel sections
-    {
-      // Both sections may be executed in parallel
-#pragma omp section
-      {
-        //////////////
-        // lsb
-        Torus *lhs = diff_buffer->tmp_packed_left;
-        Torus *rhs = diff_buffer->tmp_packed_right;
+    //////////////
+    // lsb
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                    total_num_scalar_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                total_num_scalar_blocks, message_modulus);

-        // From this point we have half number of blocks
-        num_lsb_radix_blocks /= 2;
-        num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
+    // From this point on we have half the number of blocks
+    num_lsb_radix_blocks /= 2;
+    num_lsb_radix_blocks += (total_num_scalar_blocks % 2);

-        // comparisons will be assigned
-        // - 0 if lhs < rhs
-        // - 1 if lhs == rhs
-        // - 2 if lhs > rhs
+    // comparisons will be assigned
+    // - 0 if lhs < rhs
+    // - 1 if lhs == rhs
+    // - 2 if lhs > rhs

-        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                       comparisons, lhs, rhs, mem_ptr, bsks,
-                                       ksks, num_lsb_radix_blocks);
+    auto comparisons = mem_ptr->tmp_block_comparisons;
+    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                   num_lsb_radix_blocks);

-        // Reduces a vec containing radix blocks that encrypts a sign
-        // (inferior, equal, superior) to one single radix block containing the
-        // final sign
-        tree_sign_reduction(
-            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
-            mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
-            ksks, num_lsb_radix_blocks);
-      }
-#pragma omp section
-      {
-        //////////////
-        // msb
-        host_compare_with_zero_equality(
-            msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
-            mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
+    // Reduces a vector of radix blocks that each encrypt a sign (inferior,
+    // equal, superior) to one single radix block containing the final
+    // sign
+    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
+                        comparisons, mem_ptr->diff_buffer->tree_buffer,
+                        mem_ptr->identity_lut_f, bsks, ksks,
+                        num_lsb_radix_blocks);
+    //////////////
+    // msb
+    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
+                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
    }
@@ -205,7 +194,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -311,93 +299,83 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }

-#pragma omp parallel sections
-    {
-      // Both sections may be executed in parallel
-#pragma omp section
-      {
-        //////////////
-        // lsb
-        Torus *lhs = diff_buffer->tmp_packed_left;
-        Torus *rhs = diff_buffer->tmp_packed_right;
+    //////////////
+    // lsb
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                    total_num_scalar_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                total_num_scalar_blocks, message_modulus);

-        // From this point we have half number of blocks
-        num_lsb_radix_blocks /= 2;
-        num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
+    // From this point on we have half the number of blocks
+    num_lsb_radix_blocks /= 2;
+    num_lsb_radix_blocks += (total_num_scalar_blocks % 2);

-        // comparisons will be assigned
-        // - 0 if lhs < rhs
-        // - 1 if lhs == rhs
-        // - 2 if lhs > rhs
+    // comparisons will be assigned
+    // - 0 if lhs < rhs
+    // - 1 if lhs == rhs
+    // - 2 if lhs > rhs

-        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                       comparisons, lhs, rhs, mem_ptr, bsks,
-                                       ksks, num_lsb_radix_blocks);
+    auto comparisons = mem_ptr->tmp_block_comparisons;
+    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                   num_lsb_radix_blocks);

-        // Reduces a vec containing radix blocks that encrypts a sign
-        // (inferior, equal, superior) to one single radix block containing the
-        // final sign
-        tree_sign_reduction(
-            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
-            mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
-            ksks, num_lsb_radix_blocks);
+    // Reduces a vector of radix blocks that each encrypt a sign (inferior,
+    // equal, superior) to one single radix block containing the final
+    // sign
+    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
+                        comparisons, mem_ptr->diff_buffer->tree_buffer,
+                        mem_ptr->identity_lut_f, bsks, ksks,
+                        num_lsb_radix_blocks);
+    //////////////
+    // msb
+    // We remove the last block (which is the sign)
+    Torus *are_all_msb_zeros = lwe_array_msb_out;
+    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                    are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
+                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+
+    auto sign_bit_pos = (int)log2(message_modulus) - 1;
+
+    auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
+                                         Torus msb_are_zeros) {
+      bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
+      CMP_ORDERING sign_block_ordering;
+      if (sign_bit_is_set) {
+        sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
+      } else if (sign_block != 0) {
+        sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
+      } else {
+        sign_block_ordering = CMP_ORDERING::IS_EQUAL;
      }
-#pragma omp section
-      {
-        //////////////
-        // msb
-        // We remove the last block (which is the sign)
-        Torus *are_all_msb_zeros = lwe_array_msb_out;
-        host_compare_with_zero_equality(
-            msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
-            mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);

-        auto sign_bit_pos = (int)log2(message_modulus) - 1;
+      CMP_ORDERING msb_ordering;
+      if (msb_are_zeros == 1)
+        msb_ordering = CMP_ORDERING::IS_EQUAL;
+      else
+        msb_ordering = CMP_ORDERING::IS_SUPERIOR;

-        auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
-                                             Torus msb_are_zeros) {
-          bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
-          CMP_ORDERING sign_block_ordering;
-          if (sign_bit_is_set) {
-            sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
-          } else if (sign_block != 0) {
-            sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
-          } else {
-            sign_block_ordering = CMP_ORDERING::IS_EQUAL;
-          }
+      return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
+          sign_block_ordering, msb_ordering);
+    };

-          CMP_ORDERING msb_ordering;
-          if (msb_are_zeros == 1)
-            msb_ordering = CMP_ORDERING::IS_EQUAL;
-          else
-            msb_ordering = CMP_ORDERING::IS_SUPERIOR;
+    auto signed_msb_lut = mem_ptr->signed_msb_lut;
+    generate_device_accumulator_bivariate<Torus>(
+        msb_streams[0], gpu_indexes[0],
+        signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_f);
+    signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-          return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
-              sign_block_ordering, msb_ordering);
-        };
-
-        auto signed_msb_lut = mem_ptr->signed_msb_lut;
-        generate_device_accumulator_bivariate<Torus>(
-            msb_streams[0], gpu_indexes[0],
-            signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_f);
-        signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-
-        Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-        integer_radix_apply_bivariate_lookup_table_kb(
-            msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
-            are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
-            signed_msb_lut->params.message_modulus);
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
+    Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
+    integer_radix_apply_bivariate_lookup_table_kb(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
+        are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
+        signed_msb_lut->params.message_modulus);
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
    }
@@ -422,50 +400,38 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_sign_out =
        lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
-#pragma omp parallel sections
-    {
-      // Both sections may be executed in parallel
-#pragma omp section
-      {
-        Torus *lhs = diff_buffer->tmp_packed_left;
-        Torus *rhs = diff_buffer->tmp_packed_right;
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                    big_lwe_dimension, num_lsb_radix_blocks - 1,
-                    message_modulus);
-        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                    num_lsb_radix_blocks - 1, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                num_lsb_radix_blocks - 1, message_modulus);

-        // From this point we have half number of blocks
-        num_lsb_radix_blocks /= 2;
+    // From this point on we have half the number of blocks
+    num_lsb_radix_blocks /= 2;

-        // comparisons will be assigned
-        // - 0 if lhs < rhs
-        // - 1 if lhs == rhs
-        // - 2 if lhs > rhs
-        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                       lwe_array_ct_out, lhs, rhs, mem_ptr,
-                                       bsks, ksks, num_lsb_radix_blocks);
-      }
-#pragma omp section
-      {
-        Torus *encrypted_sign_block =
-            lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
-        Torus *scalar_sign_block =
-            scalar_blocks + (total_num_scalar_blocks - 1);
+    // comparisons will be assigned
+    // - 0 if lhs < rhs
+    // - 1 if lhs == rhs
+    // - 2 if lhs > rhs
+    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                   lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
+                                   ksks, num_lsb_radix_blocks);
+    Torus *encrypted_sign_block =
+        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
+    Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);

-        auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-        create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
-                             scalar_sign_block, big_lwe_dimension, 1, 1,
-                             message_modulus, carry_modulus);
+    auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
+    create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
+                         scalar_sign_block, big_lwe_dimension, 1, 1,
+                         message_modulus, carry_modulus);

-        integer_radix_apply_bivariate_lookup_table_kb(
-            msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
-            encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
-            mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
+    integer_radix_apply_bivariate_lookup_table_kb(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
+        encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
+        mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
    }
@@ -566,6 +532,8 @@ __host__ void scalar_compare_radix_blocks_kb(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

+  if (num_radix_blocks == 0)
+    return;
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -686,58 +654,47 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
  auto lsb_streams = mem_ptr->lsb_streams;
  auto msb_streams = mem_ptr->msb_streams;

-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      if (num_halved_scalar_blocks > 0) {
-        auto packed_blocks = mem_ptr->tmp_packed_input;
-        auto packed_scalar =
-            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
+  if (num_halved_scalar_blocks > 0) {
+    auto packed_blocks = mem_ptr->tmp_packed_input;
+    auto packed_scalar =
+        packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

-        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
-                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
-                    scalar_blocks, 0, num_scalar_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
+                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
+                num_scalar_blocks, message_modulus);

-        cuda_memcpy_async_gpu_to_gpu(
-            scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
-            packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
-            lsb_streams[0], gpu_indexes[0]);
-        scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
+    cuda_memcpy_async_gpu_to_gpu(
+        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
+        packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
+        gpu_indexes[0]);
+    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

-        integer_radix_apply_univariate_lookup_table_kb(
-            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-            packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
-            scalar_comparison_luts);
-      }
+    integer_radix_apply_univariate_lookup_table_kb(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
+        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
+  }
+  //////////////
+  // msb
+  if (num_msb_radix_blocks > 0) {
+    int_radix_lut<Torus> *msb_lut;
+    switch (mem_ptr->op) {
+    case COMPARISON_TYPE::EQ:
+      msb_lut = mem_ptr->is_zero_lut;
+      break;
+    case COMPARISON_TYPE::NE:
+      msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
+      break;
+    default:
+      PANIC("Cuda error: integer operation not supported")
    }
-#pragma omp section
-    {
-      //////////////
-      // msb
-      if (num_msb_radix_blocks > 0) {
-        int_radix_lut<Torus> *msb_lut;
-        switch (mem_ptr->op) {
-        case COMPARISON_TYPE::EQ:
-          msb_lut = mem_ptr->is_zero_lut;
-          break;
-        case COMPARISON_TYPE::NE:
-          msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-          break;
-        default:
-          PANIC("Cuda error: integer operation not supported")
-        }

-        host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                        lwe_array_msb_out, msb, mem_ptr, bsks,
-                                        ksks, num_msb_radix_blocks, msb_lut);
-      }
-    }
+    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
+                                    num_msb_radix_blocks, msb_lut);
  }

-  for (uint j = 0; j < gpu_count; j++) {
+  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
    cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
    cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -33,22 +33,6 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
    int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
-  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
-    check_cuda_error(cudaFuncSetAttribute(
-        tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-        sm_size));
-    cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
-                           cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(
-        cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
-    check_cuda_error(cudaGetLastError());
-  }
-
  *mem_ptr =
      new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
                                   num_radix_blocks, allocate_gpu_memory);
@@ -108,6 +92,10 @@ __host__ void host_integer_scalar_mul_radix(
    }
  }

+  cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+  mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+  delete (mem->logical_scalar_shift_buffer);
+
  if (j == 0) {
    // lwe array = 0
    cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
@@ -117,10 +105,15 @@ __host__ void host_integer_scalar_mul_radix(
    for (int i = 0; i < j * num_radix_blocks; i++) {
      terms_degree[i] = message_modulus - 1;
    }
-    host_integer_sum_ciphertexts_vec_kb<T, params>(
+    host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
        num_radix_blocks, j);
+
+    auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
+    host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
+                                   nullptr, nullptr, scp_mem_ptr, bsks, ksks,
+                                   num_radix_blocks);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

  Torus *rotated_buffer = mem->tmp_rotated;

-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
  // rotate right all the blocks in radix ciphertext
  // copy result in new buffer
  // 256 threads are used in every block
@@ -76,6 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
                                   giver_blocks, lwe_array, 1, num_blocks,
                                   big_lwe_size);

+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
@@ -100,6 +100,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
                                  lwe_array, 1, num_blocks, big_lwe_size);

+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -10,7 +10,6 @@
 #include "types/complex/operations.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
-#include <omp.h>

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
@@ -52,13 +51,6 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
  Torus *full_rotated_buffer = mem->tmp_rotated;
  Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];

-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
-  // rotate right all the blocks in radix ciphertext
-  // copy result in new buffer
-  // 1024 threads are used in every block
-  // block_count blocks will be used in the grid
-  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
@@ -76,6 +68,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
      return;
    }

+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
    auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
    auto partial_previous_blocks =
        &full_rotated_buffer[rotations * big_lwe_size];
@@ -109,6 +102,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    auto partial_current_blocks = lwe_array;
    auto partial_next_blocks = &rotated_buffer[big_lwe_size];
+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

    size_t partial_block_count = num_blocks - rotations;

@@ -139,8 +133,6 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
    int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
    uint32_t num_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
-
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -160,15 +152,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  size_t shift_within_block = shift % num_bits_in_block;

  Torus *rotated_buffer = mem->tmp_rotated;
-  Torus *padding_block = &rotated_buffer[num_blocks * big_lwe_size];
+  Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size];
  Torus *last_block_copy = &padding_block[big_lwe_size];

-  auto lut_univariate_shift_last_block =
-      mem->lut_buffers_univariate[shift_within_block - 1];
-  auto lut_univariate_padding_block =
-      mem->lut_buffers_univariate[num_bits_in_block - 1];
-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
  if (mem->shift_type == RIGHT_SHIFT) {
    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
                                  rotated_buffer, lwe_array, rotations,
@@ -197,59 +183,59 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      return;
    }

-    // In the arithmetic shift case we have to pad with the value of the sign
-    // bit. This creates the need for a different shifting lut than in the
-    // logical shift case. We also need another PBS to create the padding block.
-    Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
-    cuda_memcpy_async_gpu_to_gpu(
-        last_block_copy,
-        rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    auto partial_current_blocks = lwe_array;
-    auto partial_next_blocks = &rotated_buffer[big_lwe_size];
-    size_t partial_block_count = num_blocks - rotations;
-    if (shift_within_block != 0 && rotations != num_blocks) {
-      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, partial_current_blocks,
-          partial_current_blocks, partial_next_blocks, bsks, ksks,
-          partial_block_count, lut_bivariate,
-          lut_bivariate->params.message_modulus);
-    }
-    // Since our CPU threads will be working on different streams we shall
-    // assert the work in the main stream is completed
-    for (uint j = 0; j < gpu_count; j++) {
-      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-    }
-#pragma omp parallel sections
-    {
-      // All sections may be executed in parallel
-#pragma omp section
-      {
-        integer_radix_apply_univariate_lookup_table_kb(
-            mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
-            last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
-        // Replace blocks 'pulled' from the left with the correct padding block
-        for (uint i = 0; i < rotations; i++) {
-          cuda_memcpy_async_gpu_to_gpu(
-              lwe_array + (num_blocks - rotations + i) * big_lwe_size,
-              padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
-              gpu_indexes[0]);
-        }
-      }
-#pragma omp section
-      {
-        if (shift_within_block != 0 && rotations != num_blocks) {
-          integer_radix_apply_univariate_lookup_table_kb(
-              mem->local_streams_2, gpu_indexes, gpu_count, last_block,
-              last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
-        }
-      }
-    }
-    for (uint j = 0; j < gpu_count; j++) {
-      cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
-      cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
-    }
+    if (num_blocks != rotations) {
+      // In the arithmetic shift case we have to pad with the value of the sign
+      // bit. This creates the need for a different shifting lut than in the
+      // logical shift case. We also need another PBS to create the padding
+      // block.
+      Torus *last_block =
+          lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
+      cuda_memcpy_async_gpu_to_gpu(
+          last_block_copy,
+          rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
+          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+      if (shift_within_block != 0) {
+        auto partial_current_blocks = lwe_array;
+        auto partial_next_blocks = &rotated_buffer[big_lwe_size];
+        size_t partial_block_count = num_blocks - rotations;
+        auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

+        integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+            streams, gpu_indexes, gpu_count, partial_current_blocks,
+            partial_current_blocks, partial_next_blocks, bsks, ksks,
+            partial_block_count, lut_bivariate,
+            lut_bivariate->params.message_modulus);
+      }
+      // The work below is enqueued on separate local streams, so first
+      // make sure the work in the main streams is completed
+      for (uint j = 0; j < gpu_count; j++) {
+        cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+      }
+      auto lut_univariate_padding_block =
+          mem->lut_buffers_univariate[num_bits_in_block - 1];
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
+          last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
+      // Replace blocks 'pulled' from the left with the correct padding
+      // block
+      for (uint i = 0; i < rotations; i++) {
+        cuda_memcpy_async_gpu_to_gpu(lwe_array + (num_blocks - rotations + i) *
+                                                     big_lwe_size,
+                                     padding_block, big_lwe_size_bytes,
+                                     mem->local_streams_1[0], gpu_indexes[0]);
+      }
+      if (shift_within_block != 0) {
+        auto lut_univariate_shift_last_block =
+            mem->lut_buffers_univariate[shift_within_block - 1];
+        integer_radix_apply_univariate_lookup_table_kb(
+            mem->local_streams_2, gpu_indexes, gpu_count, last_block,
+            last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
+      }
+      for (uint j = 0; j < mem->active_gpu_count; j++) {
+        cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
+        cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
+      }
+    }
  } else {
    PANIC("Cuda error (scalar shift): left scalar shift is never of the "
          "arithmetic type")
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  auto big_lwe_size = big_lwe_dimension + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

-  cudaSetDevice(gpu_indexes[0]);
-
  // Extract all bits
  auto bits = mem->tmp_bits;
  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
@@ -36,6 +36,18 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
 }

 // We need these lines so the compiler knows how to specialize these functions
+template __device__ const uint64_t *
+get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
+template __device__ const uint32_t *
+get_ith_mask_kth_block(const uint32_t *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
+template __device__ const double2 *
+get_ith_mask_kth_block(const double2 *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
 template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
@@ -51,6 +63,7 @@ template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t level_count);
+
 template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
@@ -67,10 +80,12 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

-template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
-    uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const uint64_t *ptr, int g, int i, int k, int level,
+    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
+    uint32_t level_count);

-template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
-    double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const double2 *ptr, int g, int i, int k, int level,
+    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
+    uint32_t level_count);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -17,6 +17,18 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
 }

 ////////////////////////////////////////////////
+template <typename T> // read-only variant: returns a const pointer into ptr
+__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
+                                           int level, uint32_t polynomial_size,
+                                           int glwe_dimension,
+                                           uint32_t level_count) {
+  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
+                                 level_count) + // base offset for index i
+              level * polynomial_size / 2 * (glwe_dimension + 1) *
+                  (glwe_dimension + 1) + // offset contributed by `level`
+              k * polynomial_size / 2 * (glwe_dimension + 1)]; // and by `k`
+}
+
 template <typename T>
 __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
@@ -27,7 +39,6 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
 }
-
 template <typename T>
 __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
@@ -50,14 +61,16 @@ __device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
 }

 template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
-  T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
-                                         glwe_dimension, level_count);
+  const T *ptr_group =
+      ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
+                              glwe_dimension, level_count);
  return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
                                glwe_dimension, level_count);
 }
+
 ////////////////////////////////////////////////
 template <typename T, typename ST>
 void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
@@ -77,7 +90,8 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

-  double2 *h_bsk = (double2 *)malloc(buffer_size);
+  double2 *h_bsk;
+  cudaMallocHost((void **)&h_bsk, buffer_size);

  double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);

@@ -101,7 +115,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
  double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
  switch (polynomial_size) {
  case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -119,7 +133,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -137,7 +151,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -155,7 +169,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -173,7 +187,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -191,7 +205,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -209,7 +223,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -233,7 +247,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,

  cuda_drop_async(d_bsk, stream, gpu_index);
  cuda_drop_async(buffer, stream, gpu_index);
-  free(h_bsk);
+  cudaFreeHost(h_bsk);
 }

 void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
@@ -254,7 +268,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  double2 *buffer;
  switch (polynomial_size) {
  case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
@@ -275,7 +289,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
    }
    break;
  case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
@@ -296,7 +310,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
    }
    break;
  case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
@@ -317,7 +331,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
    }
    break;
  case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
@@ -338,7 +352,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
    }
    break;
  case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
@@ -359,7 +373,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
    }
    break;
  case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
@@ -380,7 +394,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
    }
    break;
  case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -1,14 +1,13 @@
 #ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
 #define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH

+#include "cooperative_groups.h"
 #include "device.h"
 #include "fft/bnsmfft.cuh"
+#include "helper_multi_gpu.h"
 #include "programmable_bootstrap.h"
 #include "programmable_bootstrap_multibit.h"

-#include "cooperative_groups.h"
-#include "helper_multi_gpu.h"
-
 using namespace cooperative_groups;
 namespace cg = cooperative_groups;

@@ -22,11 +21,11 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
                        uint32_t glwe_dimension, bool support_dsm);

 template <typename Torus, typename G, class params>
-__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
-                              double2 *join_buffer, double2 *bootstrapping_key,
-                              int polynomial_size, uint32_t glwe_dimension,
-                              int level_count, int iteration, G &group,
-                              bool support_dsm = false) {
+__device__ void
+mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
+              const double2 *__restrict__ bootstrapping_key,
+              int polynomial_size, uint32_t glwe_dimension, int level_count,
+              int iteration, G &group, bool support_dsm = false) {

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(fft);
@@ -118,22 +117,17 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
 }

 template <typename Torus>
-void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
-                 uint32_t gpu_count, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes, std::vector<Torus *> lut_vec,
-                 std::vector<Torus *> lut_indexes_vec, Torus *lwe_array_in,
-                 Torus *lwe_input_indexes, void **bootstrapping_keys,
-                 std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
-                 uint32_t lwe_dimension, uint32_t polynomial_size,
-                 uint32_t base_log, uint32_t level_count,
-                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-                 uint32_t num_luts, uint32_t lwe_idx,
-                 uint32_t max_shared_memory, PBS_TYPE pbs_type,
-                 bool sync_streams = true) {
-  auto active_gpu_count =
-      get_active_gpu_count(input_lwe_ciphertext_count, gpu_count);
-  if (sync_streams)
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+void execute_pbs_async(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    const LweArrayVariant<Torus> &lwe_array_out,
+    const LweArrayVariant<Torus> &lwe_output_indexes,
+    std::vector<Torus *> lut_vec, std::vector<Torus *> lut_indexes_vec,
+    const LweArrayVariant<Torus> &lwe_array_in,
+    const LweArrayVariant<Torus> &lwe_input_indexes, void **bootstrapping_keys,
+    std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -141,20 +135,31 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case CLASSICAL:
-#pragma omp parallel for num_threads(active_gpu_count)
-      for (uint i = 0; i < active_gpu_count; i++) {
+      for (uint i = 0; i < gpu_count; i++) {
        int num_inputs_on_gpu =
            get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
+
        int gpu_offset =
            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
        auto d_lut_vector_indexes =
            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
+
+        // Use the macro to get the correct elements for the current iteration.
+        // It handles both the case where the inputs/outputs are scattered
+        // across different GPUs and the case where they are not.
+        Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
+        Torus *current_lwe_output_indexes =
+            GET_VARIANT_ELEMENT(lwe_output_indexes, i);
+        Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
+        Torus *current_lwe_input_indexes =
+            GET_VARIANT_ELEMENT(lwe_input_indexes, i);
+
        cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-            streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
-            lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+            streams[i], gpu_indexes[i], current_lwe_array_out,
+            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
+            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
-            lwe_idx, max_shared_memory, gpu_offset);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu);
      }
      break;
    default:
@@ -168,38 +173,60 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
    case MULTI_BIT:
      if (grouping_factor == 0)
        PANIC("Multi-bit PBS error: grouping factor should be > 0.")
-#pragma omp parallel for num_threads(active_gpu_count)
-      for (uint i = 0; i < active_gpu_count; i++) {
+      for (uint i = 0; i < gpu_count; i++) {
        int num_inputs_on_gpu =
            get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
+
+        // Use the macro to get the correct elements for the current iteration.
+        // It handles both the case where the inputs/outputs are scattered
+        // across different GPUs and the case where they are not.
+        Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
+        Torus *current_lwe_output_indexes =
+            GET_VARIANT_ELEMENT(lwe_output_indexes, i);
+        Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
+        Torus *current_lwe_input_indexes =
+            GET_VARIANT_ELEMENT(lwe_input_indexes, i);
+
        int gpu_offset =
            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
        auto d_lut_vector_indexes =
            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
+
        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-            streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
-            lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+            streams[i], gpu_indexes[i], current_lwe_array_out,
+            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
+            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
            polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory,
-            gpu_offset);
+            num_inputs_on_gpu);
      }
      break;
    case CLASSICAL:
-#pragma omp parallel for num_threads(active_gpu_count)
-      for (uint i = 0; i < active_gpu_count; i++) {
+      for (uint i = 0; i < gpu_count; i++) {
        int num_inputs_on_gpu =
            get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
+
+        // Use the macro to get the correct elements for the current iteration.
+        // It handles both the case where the inputs/outputs are scattered
+        // across different GPUs and the case where they are not.
+        Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
+        Torus *current_lwe_output_indexes =
+            GET_VARIANT_ELEMENT(lwe_output_indexes, i);
+        Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
+        Torus *current_lwe_input_indexes =
+            GET_VARIANT_ELEMENT(lwe_input_indexes, i);
+
        int gpu_offset =
            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
        auto d_lut_vector_indexes =
            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
+
        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-            streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
-            lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+            streams[i], gpu_indexes[i], current_lwe_array_out,
+            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
+            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
-            lwe_idx, max_shared_memory, gpu_offset);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu);
      }
      break;
    default:
@@ -210,11 +237,6 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
    PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
          "moduli are supported.")
  }
-
-  if (sync_streams)
-    for (uint i = 0; i < active_gpu_count; i++) {
-      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-    }
 }

 template <typename Torus>
@@ -222,8 +244,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
                         int8_t **pbs_buffer, uint32_t glwe_dimension,
                         uint32_t lwe_dimension, uint32_t polynomial_size,
                         uint32_t level_count, uint32_t grouping_factor,
-                         uint32_t input_lwe_ciphertext_count,
-                         uint32_t max_shared_memory, PBS_TYPE pbs_type,
+                         uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
                         bool allocate_gpu_memory) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
@@ -234,8 +255,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case CLASSICAL:
      scratch_cuda_programmable_bootstrap_32(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
-          level_count, input_lwe_ciphertext_count, max_shared_memory,
-          allocate_gpu_memory);
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    default:
      PANIC("Error: unsupported cuda PBS type.")
@@ -250,13 +270,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
      scratch_cuda_multi_bit_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, grouping_factor,
-          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+          input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    case CLASSICAL:
      scratch_cuda_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
-          level_count, input_lwe_ciphertext_count, max_shared_memory,
-          allocate_gpu_memory);
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    default:
      PANIC("Error: unsupported cuda PBS type.")
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -5,10 +5,9 @@
 */
 uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
  return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
-      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-      max_shared_memory);
+      glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
 }

 /*
@@ -20,58 +19,50 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
 void scratch_cuda_programmable_bootstrap_amortized_32(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<256>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 512:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<512>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 1024:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<1024>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 2048:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<2048>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 4096:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<4096>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 8192:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<8192>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 16384:
-    scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
-                                             AmortizedDegree<16384>>(
+    scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -88,58 +79,50 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
 void scratch_cuda_programmable_bootstrap_amortized_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<256>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 512:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<512>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 1024:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<1024>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 2048:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<2048>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 4096:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<4096>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 8192:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<8192>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 16384:
-    scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
-                                             AmortizedDegree<16384>>(
+    scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -157,8 +140,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t num_samples) {

  if (base_log > 32)
    PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -172,7 +154,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
@@ -181,7 +163,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
@@ -190,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
@@ -199,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
@@ -208,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
@@ -217,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
@@ -226,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -268,17 +250,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
 * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
- * - num_luts: parameter to set the actual number of luts to be
 * used
- * - lwe_idx: the index of the LWE input to consider for the GPU of index
- * gpu_index. In case of multi-GPU computing, it is assumed that only a part of
- * the input LWE array is copied to each GPU, but the whole LUT array is copied
- * (because the case when the number of LUTs is smaller than the number of input
- * LWEs is not trivial to take into account in the data repartition on the
- * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
- * input in the LUT array `lut_vector`.
- *  - 'max_shared_memory' maximum amount of shared memory to be used inside
- * device functions
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
@@ -306,8 +278,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t num_samples) {

  if (base_log > 64)
    PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -321,7 +292,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
@@ -330,7 +301,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
@@ -339,7 +310,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
@@ -348,7 +319,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
@@ -357,7 +328,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
@@ -366,7 +337,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
@@ -375,7 +346,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        num_samples);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -392,7 +363,6 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer) {

-  check_cuda_error(cudaSetDevice(gpu_index));
  // Free memory
  cuda_drop_async(*pbs_buffer, static_cast<cudaStream_t>(stream), gpu_index);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -42,17 +42,19 @@ template <typename Torus, class params, sharedMemDegree SMD>
 *  - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 *  - level_count: number of decomposition levels in the gadget matrix (~4)
 *  - gpu_num: index of the current GPU (useful for multi-GPU computations)
- *  - lwe_idx: equal to the number of samples per gpu x gpu_num
 *  - device_memory_size_per_sample: amount of global memory to allocate if SMD
 * is not FULLSM
 */
 __global__ void device_programmable_bootstrap_amortized(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_idx,
-    size_t device_memory_size_per_sample, uint32_t gpu_offset) {
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count,
+    size_t device_memory_size_per_sample) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
@@ -79,16 +81,15 @@ __global__ void device_programmable_bootstrap_amortized(
                      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);

  auto block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] *
-                    (lwe_dimension + 1)];
-  Torus *block_lut_vector =
-      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
+      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
                  (glwe_dimension + 1)];

  // Put "b", the body, in [0, 2N[
  Torus b_hat = 0;
-  rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                        2 * params::degree); // 2 * params::log2_degree + 1);
+  modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                 params::log2_degree + 1);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
@@ -103,8 +104,8 @@ __global__ void device_programmable_bootstrap_amortized(

    // Put "a" in [0, 2N[ instead of Zq
    Torus a_hat = 0;
-    rescale_torus_element(block_lwe_array_in[iteration], a_hat,
-                          2 * params::degree); // 2 * params::log2_degree + 1);
+    modulus_switch(block_lwe_array_in[iteration], a_hat,
+                   params::log2_degree + 1);

    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
@@ -198,7 +199,7 @@ __global__ void device_programmable_bootstrap_amortized(
  }

  auto block_lwe_array_out =
-      &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
+      &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                     (glwe_dimension * polynomial_size + 1)];

  // The blind rotation for this block is over
@@ -212,8 +213,7 @@ __global__ void device_programmable_bootstrap_amortized(
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_programmable_bootstrap_amortized(
+uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
    uint32_t polynomial_size, uint32_t glwe_dimension) {
  return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
         sizeof(Torus) * polynomial_size *
@@ -224,17 +224,17 @@ get_buffer_size_full_sm_programmable_bootstrap_amortized(
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_programmable_bootstrap_amortized(
+uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
    uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }

 template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
+uint64_t get_buffer_size_programmable_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
@@ -252,20 +252,19 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
  return device_mem + device_mem % sizeof(double2);
 }

-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_programmable_bootstrap_amortized(
    cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    cudaFuncSetAttribute(
        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
@@ -284,8 +283,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
  if (allocate_gpu_memory) {
    uint64_t buffer_size =
        get_buffer_size_programmable_bootstrap_amortized<Torus>(
-            glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-            max_shared_memory);
+            glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
    check_cuda_error(cudaGetLastError());
  }
@@ -298,10 +296,8 @@ __host__ void host_programmable_bootstrap_amortized(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(gpu_index);
  uint64_t SM_FULL =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
@@ -314,6 +310,9 @@ __host__ void host_programmable_bootstrap_amortized(

  uint64_t DM_FULL = SM_FULL;

+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
  // Create a 1-dimensional grid of threads
  // where each block handles 1 sample and each thread
  // handles opt polynomial coefficients
@@ -333,14 +332,14 @@ __host__ void host_programmable_bootstrap_amortized(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, DM_FULL, gpu_offset);
+            level_count, DM_FULL);
  } else if (max_shared_memory < SM_FULL) {
    device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, DM_PART, gpu_offset);
+            level_count, DM_PART);
  } else {
    // For devices with compute capability 7.x a single thread block can
    // address the full capacity of shared memory. Shared memory on the
@@ -352,7 +351,7 @@ __host__ void host_programmable_bootstrap_amortized(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, 0, gpu_offset);
+            level_count, 0);
  }
  check_cuda_error(cudaGetLastError());
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -36,12 +36,15 @@ namespace cg = cooperative_groups;
 */
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_cg(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    int8_t *device_mem, uint64_t device_memory_size_per_block,
-    uint32_t gpu_offset) {
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
+    uint64_t device_memory_size_per_block) {

  grid_group grid = this_grid();

@@ -74,12 +77,12 @@ __global__ void device_programmable_bootstrap_cg(

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
-                    (lwe_dimension + 1)];
+  const Torus *block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];

  double2 *block_join_buffer =
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -90,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg(

  // Put "b" in [0, 2N[
  Torus b_hat = 0;
-  rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                        2 * params::degree);
+  modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                 params::log2_degree + 1);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
@@ -103,8 +106,7 @@ __global__ void device_programmable_bootstrap_cg(

    // Put "a" in [0, 2N[
    Torus a_hat = 0;
-    rescale_torus_element(block_lwe_array_in[i], a_hat,
-                          2 * params::degree); // 2 * params::log2_degree + 1);
+    modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);

    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
@@ -140,7 +142,7 @@ __global__ void device_programmable_bootstrap_cg(
  }

  auto block_lwe_array_out =
-      &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
+      &lwe_array_out[lwe_output_indexes[blockIdx.z] *
                         (glwe_dimension * polynomial_size + 1) +
                     blockIdx.y * polynomial_size];

@@ -154,20 +156,19 @@ __global__ void device_programmable_bootstrap_cg(
  }
 }

-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_programmable_bootstrap_cg(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
@@ -201,9 +202,7 @@ __host__ void host_programmable_bootstrap_cg(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) {
-  cudaSetDevice(gpu_index);
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
@@ -214,6 +213,9 @@ __host__ void host_programmable_bootstrap_cg(
      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
          polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
  uint64_t full_dm = full_sm;

  uint64_t partial_dm = full_dm - partial_sm;
@@ -224,7 +226,7 @@ __host__ void host_programmable_bootstrap_cg(
  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

-  void *kernel_args[15];
+  void *kernel_args[14];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
@@ -238,7 +240,6 @@ __host__ void host_programmable_bootstrap_cg(
  kernel_args[10] = &base_log;
  kernel_args[11] = &level_count;
  kernel_args[12] = &d_mem;
-  kernel_args[14] = &gpu_offset;

  if (max_shared_memory < partial_sm) {
    kernel_args[13] = &full_dm;
@@ -264,8 +265,7 @@ __host__ void host_programmable_bootstrap_cg(
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
-    int glwe_dimension, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int level_count, int num_samples) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
@@ -285,6 +285,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  int max_active_blocks_per_sm;

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < partial_sm) {
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
@@ -310,37 +311,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus>
 __host__ bool supports_cooperative_groups_on_programmable_bootstrap(
-    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
  switch (polynomial_size) {
  case 256:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
  case 512:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
  case 1024:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
  case 2048:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
  case 4096:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
  case 8192:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
  case 16384:
    return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
-                                       max_shared_memory);
+        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
+                                       num_samples);
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
          "Supported N's are powers of two"
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -18,15 +18,19 @@
 #include <vector>

 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_cg_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {

  grid_group grid = this_grid();

@@ -54,12 +58,12 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
-                    (lwe_dimension + 1)];
+  const Torus *block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];

  double2 *block_join_buffer =
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -69,15 +73,15 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
      global_accumulator +
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.z * keybundle_size_per_input;

  if (lwe_offset == 0) {
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
-    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                          2 * params::degree);
+    modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                   params::log2_degree + 1);

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
@@ -117,7 +121,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(

  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
+        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

@@ -137,24 +141,21 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size * 2; // accumulator
 }

 template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t grouping_factor, uint32_t lwe_chunk_size,
-    uint32_t max_shared_memory) {
+    uint32_t grouping_factor, uint32_t lwe_chunk_size) {

  uint64_t buffer_size = 0;
  buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
@@ -169,15 +170,13 @@ __host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
  return buffer_size + buffer_size % sizeof(double2);
 }

-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_cg_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
-    pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
+    pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
@@ -188,6 +187,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < full_sm_keybundle) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
@@ -240,11 +240,9 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  if (!lwe_chunk_size)
-    lwe_chunk_size =
-        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                          polynomial_size, max_shared_memory);
-  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
+  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
      allocate_gpu_memory);
@@ -258,10 +256,8 @@ __host__ void execute_cg_external_product_loop(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset,
-    uint32_t gpu_offset) {
+    uint32_t lwe_chunk_size, int lwe_offset) {

-  cudaSetDevice(gpu_index);
  uint64_t full_dm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
@@ -270,6 +266,9 @@ __host__ void execute_cg_external_product_loop(
          polynomial_size);
  uint64_t no_dm = 0;

+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
      (glwe_dimension + 1) * (polynomial_size / 2);
@@ -282,7 +281,7 @@ __host__ void execute_cg_external_product_loop(
  auto global_accumulator = buffer->global_accumulator;
  auto buffer_fft = buffer->global_accumulator_fft;

-  void *kernel_args[21];
+  void *kernel_args[20];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
@@ -302,7 +301,6 @@ __host__ void execute_cg_external_product_loop(
  kernel_args[16] = &chunk_size;
  kernel_args[17] = &keybundle_size_per_input;
  kernel_args[18] = &d_mem;
-  kernel_args[20] = &gpu_offset;

  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -328,21 +326,17 @@ __host__ void execute_cg_external_product_loop(
  }
 }

-template <typename Torus, typename STorus, class params>
+template <typename Torus, class params>
 __host__ void host_cg_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) {
-  cudaSetDevice(gpu_index);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

-  if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, num_samples, polynomial_size, max_shared_memory);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, num_samples, polynomial_size);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {
@@ -351,24 +345,21 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, max_shared_memory,
-        lwe_chunk_size, lwe_offset, gpu_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);

    // Accumulate
    execute_cg_external_product_loop<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size,
-        max_shared_memory, lwe_offset, gpu_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
  }
 }

 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
-    int glwe_dimension, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int level_count, int num_samples) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
@@ -388,6 +379,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  int max_active_blocks_per_sm;

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < partial_sm_cg_accumulate) {
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
@@ -418,37 +410,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
 // group constraints
 template <typename Torus>
 __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
-    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
  switch (polynomial_size) {
  case 256:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
  case 512:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
  case 1024:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
  case 2048:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
  case 4096:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
  case 8192:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
  case 16384:
    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
-                                       max_shared_memory);
+        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
+                                       num_samples);
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -8,54 +8,46 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                   uint32_t polynomial_size,
                                                   uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   uint32_t max_shared_memory) {
+                                                   uint32_t num_samples) {
  return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
-      glwe_dimension, polynomial_size, level_count, num_samples,
-      max_shared_memory);
+      glwe_dimension, polynomial_size, level_count, num_samples);
 }

 template <typename Torus>
-bool has_support_to_cuda_programmable_bootstrap_tbc(
-    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
+                                                    uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count) {
 #if CUDA_ARCH >= 900
  switch (polynomial_size) {
  case 256:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
  case 512:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
  case 1024:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 2048:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 4096:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 8192:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 16384:
    return supports_thread_block_clusters_on_classic_programmable_bootstrap<
        Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
-                                       polynomial_size, level_count,
-                                       max_shared_memory);
+                                       polynomial_size, level_count);
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
@@ -67,55 +59,54 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
 }

 #if (CUDA_ARCH >= 900)
-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 512:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 1024:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 2048:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 4096:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 8192:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 16384:
-    scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
+    scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -131,8 +122,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
@@ -140,56 +130,49 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_tbc<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 8192:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 16384:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -204,69 +187,68 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
 */
 uint64_t get_buffer_size_programmable_bootstrap_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {

  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
          glwe_dimension, polynomial_size, level_count,
-          input_lwe_ciphertext_count, max_shared_memory))
+          input_lwe_ciphertext_count))
    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory);
+        input_lwe_ciphertext_count);
  else
    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory);
+        input_lwe_ciphertext_count);
 }

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<256>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 512:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<512>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 1024:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<1024>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 2048:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<2048>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 4096:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<4096>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 8192:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<8192>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  case 16384:
-    scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<16384>>(
+    scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -275,55 +257,54 @@ void scratch_cuda_programmable_bootstrap_cg(
  }
 }

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<256>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 512:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<512>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 1024:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<1024>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 2048:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<2048>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 4096:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<4096>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 8192:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<8192>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  case 16384:
-    scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<16384>>(
+    scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -341,31 +322,30 @@ void scratch_cuda_programmable_bootstrap(
 void scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

 #if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          level_count, max_shared_memory))
-    scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
+          level_count))
+    scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
        stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
 #endif
      if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
              glwe_dimension, polynomial_size, level_count,
-              input_lwe_ciphertext_count, max_shared_memory))
-    scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
+              input_lwe_ciphertext_count))
+    scratch_cuda_programmable_bootstrap_cg<uint32_t>(
        stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
-    scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
+    scratch_cuda_programmable_bootstrap<uint32_t>(
        stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
 }

 /*
@@ -376,31 +356,30 @@ void scratch_cuda_programmable_bootstrap_32(
 void scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

 #if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          level_count, max_shared_memory))
-    scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
+          level_count))
+    scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
 #endif
      if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
              glwe_dimension, polynomial_size, level_count,
-              input_lwe_ciphertext_count, max_shared_memory))
-    scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
+              input_lwe_ciphertext_count))
+    scratch_cuda_programmable_bootstrap_cg<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
-    scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
+    scratch_cuda_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
 }

 template <typename Torus>
@@ -410,8 +389,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
@@ -419,56 +397,49 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_cg<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_cg<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 8192:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 16384:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -484,8 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
@@ -493,56 +463,49 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 8192:
    host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case 16384:
    host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory, gpu_offset);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -559,15 +522,14 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t num_samples) {

  if (base_log > 32)
    PANIC("Cuda error (classical PBS): base log should be > number of bits "
          "in the ciphertext representation (32)");

-  pbs_buffer<uint64_t, CLASSICAL> *buffer =
-      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
+  pbs_buffer<uint32_t, CLASSICAL> *buffer =
+      (pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;

  switch (buffer->pbs_variant) {
  case TBC:
@@ -579,14 +541,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+    break;
 #else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
 #endif
-    break;
  case CG:
    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
@@ -595,10 +555,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -608,10 +566,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lut_vector_indexes),
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -654,17 +610,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
- * - num_luts: parameter to set the actual number of luts to be
- * used
- * - lwe_idx: the index of the LWE input to consider for the GPU of index
- * gpu_index. In case of multi-GPU computing, it is assumed that only a part of
- * the input LWE array is copied to each GPU, but the whole LUT array is copied
- * (because the case when the number of LUTs is smaller than the number of input
- * LWEs is not trivial to take into account in the data repartition on the
- * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
- * input in the LUT array `lut_vector`.
- *  - 'max_shared_memory' maximum amount of shared memory to be used inside
- * device functions
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
@@ -696,8 +641,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t num_samples) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be > number of bits "
          "in the ciphertext representation (64)");
@@ -715,14 +659,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+    break;
 #else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
 #endif
-    break;
  case PBS_VARIANT::CG:
    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
@@ -731,10 +673,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -744,10 +684,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory, gpu_offset);
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -760,14 +698,13 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
 */
 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **buffer) {
-  cudaSetDevice(gpu_index);
  auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
 }

 template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);

 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -776,8 +713,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
+    uint32_t level_count, uint32_t num_samples);

 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -786,21 +722,18 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
+    uint32_t level_count, uint32_t num_samples);

-template void scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
+template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

-template void scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
+template void scratch_cuda_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -809,8 +742,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
+    uint32_t level_count, uint32_t num_samples);

 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -819,28 +751,25 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
+    uint32_t level_count, uint32_t num_samples);

-template void scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
+template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

-template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
+template void scratch_cuda_programmable_bootstrap<uint32_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);
 template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);

 #if CUDA_ARCH >= 900
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
@@ -850,8 +779,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
+    uint32_t level_count, uint32_t num_samples);
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
@@ -859,18 +787,15 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
-template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
+    uint32_t level_count, uint32_t num_samples);
+template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+template void scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -17,13 +17,17 @@
 #include "types/complex/operations.cuh"

 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_one(
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_one(
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -50,12 +54,12 @@ __global__ void device_programmable_bootstrap_step_one(

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
-                    (lwe_dimension + 1)];
+  const Torus *block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];

  Torus *global_slice =
      global_accumulator +
@@ -71,8 +75,8 @@ __global__ void device_programmable_bootstrap_step_one(
    // First iteration
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
-    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                          2 * params::degree);
+    modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                   params::log2_degree + 1);
    // The y-dimension is used to select the element of the GLWE this block will
    // compute
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
@@ -90,8 +94,8 @@ __global__ void device_programmable_bootstrap_step_one(

  // Put "a" in [0, 2N[
  Torus a_hat = 0;
-  rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
-                        2 * params::degree); // 2 * params::log2_degree + 1);
+  modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
+                 params::log2_degree + 1); // 2 * params::log2_degree + 1);

  synchronize_threads_in_block();

@@ -128,13 +132,16 @@ __global__ void device_programmable_bootstrap_step_one(
 }

 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_two(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, double2 *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -200,7 +207,7 @@ __global__ void device_programmable_bootstrap_step_two(
  if (lwe_iteration + 1 == lwe_dimension) {
    // Last iteration
    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
+        &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

@@ -223,9 +230,9 @@ __global__ void device_programmable_bootstrap_step_two(
 }

 template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
+uint64_t get_buffer_size_programmable_bootstrap(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
@@ -241,6 +248,7 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
  uint64_t full_dm = full_sm_step_one;

  uint64_t device_mem = 0;
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
@@ -263,15 +271,13 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
  return buffer_size + buffer_size % sizeof(double2);
 }

-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
@@ -281,6 +287,8 @@ __host__ void scratch_programmable_bootstrap(
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
  // Configure step one
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
    check_cuda_error(cudaFuncSetAttribute(
@@ -333,10 +341,10 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
                 uint32_t glwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
-                 uint32_t max_shared_memory, int lwe_iteration,
-                 uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
-                 uint64_t full_dm, uint32_t gpu_offset) {
+                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
+                 uint64_t full_sm, uint64_t full_dm) {

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
@@ -347,21 +355,21 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, full_dm, gpu_offset);
+            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, partial_dm, gpu_offset);
+            level_count, d_mem, partial_dm);
  } else {
    device_programmable_bootstrap_step_one<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, 0, gpu_offset);
+            level_count, d_mem, 0);
  }
  check_cuda_error(cudaGetLastError());
 }
@@ -375,10 +383,10 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
                 uint32_t glwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
-                 uint32_t max_shared_memory, int lwe_iteration,
-                 uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
-                 uint64_t full_dm, uint32_t gpu_offset) {
+                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
+                 uint64_t full_sm, uint64_t full_dm) {

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);
  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
@@ -389,21 +397,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, full_dm, gpu_offset);
+            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, partial_dm, gpu_offset);
+            level_count, d_mem, partial_dm);
  } else {
    device_programmable_bootstrap_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, 0, gpu_offset);
+            level_count, d_mem, 0);
  }
  check_cuda_error(cudaGetLastError());
 }
@@ -417,8 +425,7 @@ __host__ void host_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
  cudaSetDevice(gpu_index);

  // With SM each block corresponds to either the mask or body, no need to
@@ -447,16 +454,14 @@ __host__ void host_programmable_bootstrap(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, global_accumulator,
        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, d_mem,
-        max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one,
-        full_dm_step_one, gpu_offset);
+        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
+        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_step_two<Torus, params>(
        stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, d_mem,
-        max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two,
-        full_dm_step_two, gpu_offset);
+        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
+        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -9,53 +9,45 @@

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t num_samples) {
  return supports_cooperative_groups_on_multibit_programmable_bootstrap<
-      uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
-                max_shared_memory);
+      uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples);
 }

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+    uint32_t level_count) {
 #if CUDA_ARCH >= 900
  switch (polynomial_size) {
  case 256:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
  case 512:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
  case 1024:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 2048:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 4096:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 8192:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
  case 16384:
    return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
        Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
-                                       polynomial_size, level_count,
-                                       max_shared_memory);
+                                       polynomial_size, level_count);
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
@@ -73,9 +65,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -83,74 +73,60 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(

  switch (polynomial_size) {
  case 256:
-    host_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t,
-                                             AmortizedDegree<256>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 512:
-    host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                             AmortizedDegree<512>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 1024:
-    host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                             AmortizedDegree<1024>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 2048:
-    host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                             AmortizedDegree<2048>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 4096:
-    host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                             AmortizedDegree<4096>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 8192:
-    host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                             AmortizedDegree<8192>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 16384:
-    host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                             AmortizedDegree<16384>>(
+    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -166,9 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -176,73 +150,60 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(

  switch (polynomial_size) {
  case 256:
-    host_multi_bit_programmable_bootstrap<uint64_t, int64_t,
-                                          AmortizedDegree<256>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 512:
-    host_multi_bit_programmable_bootstrap<Torus, int64_t, AmortizedDegree<512>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 1024:
-    host_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                          AmortizedDegree<1024>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 2048:
-    host_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                          AmortizedDegree<2048>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 4096:
-    host_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                          AmortizedDegree<4096>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 8192:
-    host_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                          AmortizedDegree<8192>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 16384:
-    host_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                          AmortizedDegree<16384>>(
+    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -257,9 +218,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
-    uint32_t lwe_chunk_size) {
+    uint32_t level_count, uint32_t num_samples) {

  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -274,15 +233,13 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lut_vector_indexes),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<uint64_t *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
+        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
+    break;
 #else
    PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
 #endif
-    break;
  case PBS_VARIANT::CG:
    cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
@@ -293,8 +250,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -306,70 +262,61 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
  }
 }

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<256>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  case 512:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<512>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  case 1024:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<1024>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  case 2048:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<2048>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  case 4096:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<4096>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  case 8192:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<8192>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  case 16384:
-    scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                AmortizedDegree<16384>>(
+    scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -378,70 +325,55 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
  }
 }

-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<256>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 512:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<512>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 1024:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<1024>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 2048:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<2048>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 4096:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<4096>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 8192:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<8192>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 16384:
-    scratch_multi_bit_programmable_bootstrap<Torus, STorus,
-                                             AmortizedDegree<16384>>(
+    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -454,40 +386,35 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size) {
+    bool allocate_gpu_memory) {

 #if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          level_count, max_shared_memory))
-    scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
+          level_count))
+    scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
-        grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
  else
 #endif
      if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
              uint64_t>(glwe_dimension, polynomial_size, level_count,
-                        input_lwe_ciphertext_count, max_shared_memory))
-    scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
+                        input_lwe_ciphertext_count))
+    scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
-        lwe_chunk_size);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
-    scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
+    scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
-        grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
 }

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **buffer) {
-  cudaSetDevice(gpu_index);
  auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
 }
@@ -504,15 +431,15 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
 */
 template <typename Torus, class params>
-__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                                     uint32_t polynomial_size,
-                                     uint32_t max_shared_memory) {
+uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                            uint32_t polynomial_size) {

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);

  int max_blocks_per_sm;
+  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
@@ -557,13 +484,12 @@ __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
  return divisor;
 }

-template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
+template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size);
+    bool allocate_gpu_memory);

 template void
 cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -573,17 +499,13 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

-template void
-scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
+template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template void
 cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -593,80 +515,63 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 template bool
 has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);

 #if (CUDA_ARCH >= 900)
-template <typename Torus, typename STorus>
+template <typename Torus>
 void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<256>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 512:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<512>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 1024:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<1024>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 2048:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<2048>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 4096:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<4096>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 8192:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<8192>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  case 16384:
-    scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
-                                                 AmortizedDegree<16384>>(
+    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -681,9 +586,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -691,74 +594,60 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(

  switch (polynomial_size) {
  case 256:
-    host_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t,
-                                              AmortizedDegree<256>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 512:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                              AmortizedDegree<512>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 1024:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                              AmortizedDegree<1024>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 2048:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                              AmortizedDegree<2048>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 4096:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                              AmortizedDegree<4096>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 8192:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                              AmortizedDegree<8192>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  case 16384:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
-                                              AmortizedDegree<16384>>(
+    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
-        lwe_chunk_size);
+        num_samples);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -767,13 +656,11 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
  }
 }

-template void
-scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
+template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template void
 cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -783,7 +670,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,7 +18,7 @@
 #include <vector>

 template <typename Torus, class params>
-__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
+__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
                                            uint32_t ggsw_idx,
                                            uint32_t grouping_factor) {
  Torus x = 0;
@@ -28,18 +28,101 @@ __device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
    x += selection_bit * lwe_array_group[i];
  }

-  return rescale_torus_element(
-      x, 2 * params::degree); // 2 * params::log2_degree + 1);
+  return modulus_switch(x, params::log2_degree + 1);
+}
+
+// Computes one keybundle polynomial for a single LWE iteration: sums the
+// 2^grouping_factor bsk terms selected by (glwe_id, level_id, poly_id) into
+// `accumulator`, every term but the first going through a product by a
+// monomial derived from the LWE mask, then runs NSMFFT_direct in place and
+// copies the result into slot `chunk_id` of `keybundle`.
+// NOTE(review): `lwe_chunk_size` is not read in this function - confirm it is
+// only kept for symmetry with the calling kernel.
+template <typename Torus, class params, sharedMemDegree SMD>
+__device__ void compute_multi_bit_programmable_bootstrap_keybundle(
+    const Torus *__restrict__ lwe_in, double2 *__restrict__ keybundle,
+    const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t level_count, uint32_t lwe_chunk_size, uint32_t level_id,
+    uint32_t glwe_id, uint32_t poly_id, uint32_t chunk_id,
+    uint32_t lwe_iteration, Torus *accumulator) {
+
+  // Groups of the key and of the LWE mask are addressed from the last one
+  uint32_t rev_lwe_iteration =
+      ((lwe_dimension / grouping_factor) - lwe_iteration - 1);
+
+  // Keygen guarantees the first term is a constant term of the polynomial, no
+  // polynomial multiplication required
+  const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
+      bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
+      grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
+  const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+
+  copy_polynomial<Torus, params::opt, params::degree / params::opt>(
+      bsk_poly, accumulator);
+
+  // Accumulate the other terms; the LWE mask group is the same for every g
+  const Torus *lwe_array_group = lwe_in + rev_lwe_iteration * grouping_factor;
+  for (int g = 1; g < (1 << grouping_factor); g++) {
+    const Torus *bsk_slice_g = get_multi_bit_ith_lwe_gth_group_kth_block(
+        bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
+        grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
+    const Torus *bsk_poly_g = bsk_slice_g + poly_id * params::degree;
+
+    // Calculates the monomial degree
+    uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
+        lwe_array_group, g, grouping_factor);
+    synchronize_threads_in_block();
+    // Multiply by the bsk element
+    polynomial_product_accumulate_by_monomial<Torus, params>(
+        accumulator, bsk_poly_g, monomial_degree, false);
+  }
+
+  synchronize_threads_in_block();
+
+  // Convert this thread's coefficients to normalized doubles in a local array
+  double2 temp[params::opt / 2];
+  int tid = threadIdx.x;
+#pragma unroll
+  for (int i = 0; i < params::opt / 2; i++) {
+    temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
+    temp[i].y = __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
+    temp[i].x /= (double)std::numeric_limits<Torus>::max();
+    temp[i].y /= (double)std::numeric_limits<Torus>::max();
+    tid += params::degree / params::opt;
+  }
+
+  synchronize_threads_in_block();
+  // Write them back into the accumulator buffer, now viewed as complex
+  tid = threadIdx.x;
+  double2 *fft = (double2 *)accumulator;
+#pragma unroll
+  for (int i = 0; i < params::opt / 2; i++) {
+    fft[tid] = temp[i];
+    tid += params::degree / params::opt;
+  }
+  synchronize_threads_in_block();
+  NSMFFT_direct<HalfDegree<params>>(fft);
+
+  // Locate this chunk entry's output polynomial and store the transform
+  auto keybundle_out = get_ith_mask_kth_block(
+      keybundle, chunk_id, glwe_id, level_id, polynomial_size, glwe_dimension,
+      level_count);
+  auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
+
+  copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
+      fft, keybundle_poly);
+}

 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
-    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
-    Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
+    const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+    uint64_t device_memory_size_per_block) {

  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory = sharedmem;
@@ -58,102 +141,47 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
  uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
  uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
  uint32_t input_idx = blockIdx.x / lwe_chunk_size;
+  uint32_t chunk_id = blockIdx.x % lwe_chunk_size;

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {
    //
    Torus *accumulator = (Torus *)selected_memory;

-    Torus *block_lwe_array_in =
-        &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] *
-                      (lwe_dimension + 1)];
+    const Torus *block_lwe_array_in =
+        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];

    double2 *keybundle = keybundle_array +
                         // select the input
                         input_idx * keybundle_size_per_input;

-    ////////////////////////////////////////////////////////////
-    // Computes all keybundles
-    uint32_t rev_lwe_iteration =
-        ((lwe_dimension / grouping_factor) - lwe_iteration - 1);
-
-    // ////////////////////////////////
-    // Keygen guarantees the first term is a constant term of the polynomial, no
-    // polynomial multiplication required
-    Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
-        bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
-        grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    Torus *bsk_poly = bsk_slice + poly_id * params::degree;
-
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        bsk_poly, accumulator);
-
-    // Accumulate the other terms
-    for (int g = 1; g < (1 << grouping_factor); g++) {
-
-      Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
-          bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
-          grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      Torus *bsk_poly = bsk_slice + poly_id * params::degree;
-
-      // Calculates the monomial degree
-      Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, g, grouping_factor);
-
-      synchronize_threads_in_block();
-      // Multiply by the bsk element
-      polynomial_product_accumulate_by_monomial<Torus, params>(
-          accumulator, bsk_poly, monomial_degree, false);
-    }
-
-    synchronize_threads_in_block();
-
-    double2 *fft = (double2 *)selected_memory;
-
-    // Move accumulator to local memory
-    double2 temp[params::opt / 2];
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
-      temp[i].y =
-          __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
-      temp[i].x /= (double)std::numeric_limits<Torus>::max();
-      temp[i].y /= (double)std::numeric_limits<Torus>::max();
-      tid += params::degree / params::opt;
-    }
-
-    synchronize_threads_in_block();
-    // Move from local memory back to shared memory but as complex
-    tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] = temp[i];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-    NSMFFT_direct<HalfDegree<params>>(fft);
-
-    // lwe iteration
-    auto keybundle_out = get_ith_mask_kth_block(
-        keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
-        polynomial_size, glwe_dimension, level_count);
-    auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
-
-    copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
-        fft, keybundle_poly);
+      compute_multi_bit_programmable_bootstrap_keybundle<Torus, params, SMD>(block_lwe_array_in,
+                                                                             keybundle,
+                                                                             bootstrapping_key,
+                                                                             lwe_dimension,
+                                                                             glwe_dimension,
+                                                                             polynomial_size,
+                                                                             grouping_factor,
+                                                                             level_count,
+                                                                             lwe_chunk_size,
+                                                                             level_id, glwe_id,
+                                                                             poly_id,
+                                                                             chunk_id,
+                                                                             lwe_iteration,
+                                                                             accumulator);
  }
 }

 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
-    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_one(
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -179,12 +207,12 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double2 *)sharedmem;

-  Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
-                    (lwe_dimension + 1)];
+  const Torus *block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];

  Torus *global_slice =
      global_accumulator +
@@ -202,8 +230,8 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
    // Initializes the accumulator with the body of LWE
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
-    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                          2 * params::degree);
+    modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                   params::log2_degree + 1);

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
@@ -242,13 +270,15 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
 }

 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
@@ -268,11 +298,11 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
  double2 *accumulator_fft = (double2 *)selected_memory;

  //
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.x * lwe_chunk_size * level_count *
-                           (glwe_dimension + 1) * (glwe_dimension + 1) *
-                           (polynomial_size / 2);
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.x * lwe_chunk_size * level_count *
+                                 (glwe_dimension + 1) * (glwe_dimension + 1) *
+                                 (polynomial_size / 2);

  double2 *global_accumulator_fft_input =
      global_accumulator_fft +
@@ -312,7 +342,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
  if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
    // Last iteration
    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
+        &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

@@ -327,58 +357,35 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
  }
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
     uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size; // accumulator
+  return sizeof(double2) * polynomial_size / 2; // accumulator (as double2)
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size * 2; // accumulator
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
+uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
 }

-template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_multibit_programmable_bootstrap(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
-
-  uint64_t buffer_size = 0;
-  buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
-                 (glwe_dimension + 1) * (glwe_dimension + 1) *
-                 (polynomial_size / 2) * sizeof(double2); // keybundle fft
-  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
-                 level_count * (polynomial_size / 2) *
-                 sizeof(double2); // global_accumulator_fft
-  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
-                 polynomial_size * sizeof(Torus); // global_accumulator
-
-  return buffer_size + buffer_size % sizeof(double2);
-}
-
-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0) {
+    bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
@@ -469,10 +476,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  if (!lwe_chunk_size)
-    lwe_chunk_size =
-        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                          polynomial_size, max_shared_memory);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
@@ -486,10 +491,8 @@ __host__ void execute_compute_keybundle(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset,
-    uint32_t gpu_offset) {
+    uint32_t lwe_chunk_size, int lwe_offset) {

-  cudaSetDevice(gpu_index);
  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);

@@ -500,6 +503,8 @@ __host__ void execute_compute_keybundle(
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);

  auto d_mem = buffer->d_mem_keybundle;
  auto keybundle_fft = buffer->keybundle_fft;
@@ -514,15 +519,15 @@ __host__ void execute_compute_keybundle(
        <<<grid_keybundle, thds, 0, stream>>>(
            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-            base_log, level_count, lwe_offset, chunk_size,
-            keybundle_size_per_input, d_mem, full_sm_keybundle, gpu_offset);
+            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+            d_mem, full_sm_keybundle);
  else
    device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
        <<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-            base_log, level_count, lwe_offset, chunk_size,
-            keybundle_size_per_input, d_mem, 0, gpu_offset);
+            level_count, lwe_offset, chunk_size, keybundle_size_per_input,
+            d_mem, 0);
  check_cuda_error(cudaGetLastError());
 }

@@ -534,16 +539,16 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
                               uint32_t num_samples, uint32_t lwe_dimension,
                               uint32_t glwe_dimension,
                               uint32_t polynomial_size, uint32_t base_log,
-                               uint32_t level_count, uint32_t max_shared_memory,
-                               int j, int lwe_offset, uint32_t gpu_offset) {
+                               uint32_t level_count, int j, int lwe_offset) {

-  cudaSetDevice(gpu_index);
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
  uint64_t partial_sm_accumulate_step_one =
      get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
          Torus>(polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);

  //
  auto d_mem = buffer->d_mem_acc_step_one;
@@ -560,7 +565,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
            lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
            global_accumulator, global_accumulator_fft, lwe_dimension,
            glwe_dimension, polynomial_size, base_log, level_count,
-            j + lwe_offset, d_mem, full_sm_accumulate_step_one, gpu_offset);
+            j + lwe_offset, d_mem, full_sm_accumulate_step_one);
  else if (max_shared_memory < full_sm_accumulate_step_one)
    device_multi_bit_programmable_bootstrap_accumulate_step_one<Torus, params,
                                                                PARTIALSM>
@@ -569,7 +574,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
                     lut_vector_indexes, global_accumulator,
                     global_accumulator_fft, lwe_dimension, glwe_dimension,
                     polynomial_size, base_log, level_count, j + lwe_offset,
-                     d_mem, partial_sm_accumulate_step_one, gpu_offset);
+                     d_mem, partial_sm_accumulate_step_one);
  else
    device_multi_bit_programmable_bootstrap_accumulate_step_one<Torus, params,
                                                                FULLSM>
@@ -578,24 +583,25 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
                     lut_vector_indexes, global_accumulator,
                     global_accumulator_fft, lwe_dimension, glwe_dimension,
                     polynomial_size, base_log, level_count, j + lwe_offset,
-                     d_mem, 0, gpu_offset);
+                     d_mem, 0);
  check_cuda_error(cudaGetLastError());
 }

 template <typename Torus, class params>
-__host__ void
-execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes,
-                 pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
-                 uint32_t lwe_dimension, uint32_t glwe_dimension,
-                 uint32_t polynomial_size, int32_t grouping_factor,
-                 uint32_t level_count, uint32_t max_shared_memory, int j,
-                 int lwe_offset, uint32_t lwe_chunk_size, uint32_t gpu_offset) {
+__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
+                               Torus *lwe_array_out, Torus *lwe_output_indexes,
+                               pbs_buffer<Torus, MULTI_BIT> *buffer,
+                               uint32_t num_samples, uint32_t lwe_dimension,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size,
+                               int32_t grouping_factor, uint32_t level_count,
+                               int j, int lwe_offset, uint32_t lwe_chunk_size) {

-  cudaSetDevice(gpu_index);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);

  auto d_mem = buffer->d_mem_acc_step_two;
  auto keybundle_fft = buffer->keybundle_fft;
@@ -612,8 +618,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
            lwe_array_out, lwe_output_indexes, keybundle_fft,
            global_accumulator, global_accumulator_fft, lwe_dimension,
            glwe_dimension, polynomial_size, level_count, grouping_factor, j,
-            lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two,
-            gpu_offset);
+            lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two);
  else
    device_multi_bit_programmable_bootstrap_accumulate_step_two<Torus, params,
                                                                FULLSM>
@@ -621,27 +626,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
           stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft,
                     global_accumulator, global_accumulator_fft, lwe_dimension,
                     glwe_dimension, polynomial_size, level_count,
-                     grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0,
-                     gpu_offset);
+                     grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0);
  check_cuda_error(cudaGetLastError());
 }

-template <typename Torus, typename STorus, class params>
+template <typename Torus, class params>
 __host__ void host_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) {
-  cudaSetDevice(gpu_index);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

-  // If a chunk size is not passed to this function, select one.
-  if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, num_samples, polynomial_size, max_shared_memory);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, num_samples, polynomial_size);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {
@@ -650,8 +649,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, max_shared_memory,
-        lwe_chunk_size, lwe_offset, gpu_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
    // Accumulate
    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
@@ -659,14 +657,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
      execute_step_one<Torus, params>(
          stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
          lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, max_shared_memory, j,
-          lwe_offset, gpu_offset);
+          polynomial_size, base_log, level_count, j, lwe_offset);

      execute_step_two<Torus, params>(
          stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
          num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, max_shared_memory, j, lwe_offset,
-          lwe_chunk_size, gpu_offset);
+          grouping_factor, level_count, j, lwe_offset, lwe_chunk_size);
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -36,12 +36,15 @@ namespace cg = cooperative_groups;
 */
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_tbc(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    int8_t *device_mem, uint64_t device_memory_size_per_block, bool support_dsm,
-    uint32_t gpu_offset) {
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
+    uint64_t device_memory_size_per_block, bool support_dsm) {

  cluster_group cluster = this_cluster();

@@ -77,12 +80,12 @@ __global__ void device_programmable_bootstrap_tbc(

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
-                    (lwe_dimension + 1)];
+  const Torus *block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];

  double2 *block_join_buffer =
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -93,8 +96,8 @@ __global__ void device_programmable_bootstrap_tbc(

  // Put "b" in [0, 2N[
  Torus b_hat = 0;
-  rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                        2 * params::degree);
+  modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                 params::log2_degree + 1);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
@@ -106,8 +109,8 @@ __global__ void device_programmable_bootstrap_tbc(

    // Put "a" in [0, 2N[
    Torus a_hat = 0;
-    rescale_torus_element(block_lwe_array_in[i], a_hat,
-                          2 * params::degree); // 2 * params::log2_degree + 1);
+    modulus_switch(block_lwe_array_in[i], a_hat,
+                   params::log2_degree + 1); // 2 * params::log2_degree + 1);

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
@@ -143,7 +146,7 @@ __global__ void device_programmable_bootstrap_tbc(
  }

  auto block_lwe_array_out =
-      &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
+      &lwe_array_out[lwe_output_indexes[blockIdx.z] *
                         (glwe_dimension * polynomial_size + 1) +
                     blockIdx.y * polynomial_size];

@@ -157,18 +160,16 @@ __global__ void device_programmable_bootstrap_tbc(
  }
 }

-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_programmable_bootstrap_tbc(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  bool supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);

  uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
      polynomial_size);
@@ -180,6 +181,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
    minimum_sm_tbc =
        get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
            polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);

  if (max_shared_memory >= full_sm + minimum_sm_tbc) {
    check_cuda_error(cudaFuncSetAttribute(
@@ -223,13 +225,11 @@ __host__ void host_programmable_bootstrap_tbc(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) {
-  cudaSetDevice(gpu_index);
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {

  auto supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
@@ -244,6 +244,9 @@ __host__ void host_programmable_bootstrap_tbc(
        get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
            polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
  uint64_t full_dm = full_sm;

  uint64_t partial_dm = full_dm - partial_sm;
@@ -278,7 +281,7 @@ __host__ void host_programmable_bootstrap_tbc(
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
        lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
        lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
-        supports_dsm, gpu_offset));
+        supports_dsm));
  } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
    config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;

@@ -287,7 +290,7 @@ __host__ void host_programmable_bootstrap_tbc(
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
        lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
        lwe_dimension, polynomial_size, base_log, level_count, d_mem,
-        partial_dm, supports_dsm, gpu_offset));
+        partial_dm, supports_dsm));
  } else {
    config.dynamicSmemBytes = full_sm + minimum_sm_tbc;

@@ -296,15 +299,14 @@ __host__ void host_programmable_bootstrap_tbc(
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
        lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
        lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
-        supports_dsm, gpu_offset));
+        supports_dsm));
  }
 }

 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
-    int glwe_dimension, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int level_count, int num_samples) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
@@ -318,12 +320,12 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
      get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
          params::degree);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  int thds = params::degree / params::opt;

  // Get the maximum number of active blocks per streaming multiprocessor
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  int max_active_blocks_per_sm;
-
  if (max_shared_memory < partial_sm) {
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_active_blocks_per_sm,
@@ -348,13 +350,13 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
 }

 template <typename Torus>
-__host__ bool
-supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory) {
+bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
+    uint32_t polynomial_size) {
  uint64_t minimum_sm =
      get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
          polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < minimum_sm) {
    // If we cannot store a single polynomial in a block shared memory we cannot
    // use TBC
@@ -367,7 +369,7 @@ supports_distributed_shared_memory_on_classic_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+    uint32_t level_count) {

  if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
    return false;
@@ -379,7 +381,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
          polynomial_size);
  uint64_t minimum_sm_tbc = 0;
  if (supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory))
+          Torus>(polynomial_size))
    minimum_sm_tbc =
        get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
            polynomial_size);
@@ -402,6 +404,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
   * case and it will fail if we try. Thus, since level_count *
   * (glwe_dimension+1) is usually smaller than 8 at this moment, we will
   * disable cudaFuncAttributeNonPortableClusterSizeAllowed */
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < partial_sm + minimum_sm_tbc) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -18,16 +18,21 @@
 #include <vector>

 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, bool support_dsm,
-    uint32_t gpu_offset) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_tbc(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const Torus *__restrict__ bootstrapping_key,
+        double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor,
+         uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        bool support_dsm) {

  cluster_group cluster = this_cluster();

@@ -49,7 +54,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

-  Torus *accumulator = (Torus *)selected_memory;
+  Torus *keybundle_accumulator = (Torus *)selected_memory;
+  Torus *accumulator = keybundle_accumulator + polynomial_size;
  double2 *accumulator_fft =
      (double2 *)accumulator +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
@@ -62,12 +68,12 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
-                    (lwe_dimension + 1)];
+  const Torus *__restrict__ block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];

  double2 *block_join_buffer =
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -77,27 +83,21 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
      global_accumulator +
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.z * keybundle_size_per_input;
+  double2 *__restrict__ keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.z * keybundle_size_per_input;

-  if (lwe_offset == 0) {
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
-    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
-                          2 * params::degree);
+    modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
+                   params::log2_degree + 1);

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        global_slice, accumulator);
-  }

-  for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
+  for (int i = 0; i < lwe_dimension / grouping_factor; i++) {
    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
@@ -115,17 +115,28 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
    // don't modify the same memory space at the same time
    synchronize_threads_in_block();

+    // Computes keybundle
+    for(int poly_id = 0; poly_id < glwe_dimension+1; poly_id++){
+    compute_multi_bit_programmable_bootstrap_keybundle<Torus, params, SMD>(
+            block_lwe_array_in,
+            keybundle,
+            bootstrapping_key,
+            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, (uint32_t)1,
+            (uint32_t)blockIdx.x,(uint32_t)blockIdx.y, (uint32_t)poly_id, (uint32_t)0, (uint32_t)i, keybundle_accumulator);
+            cluster.sync();    synchronize_threads_in_block();
+
+    }
+
    // Perform G^-1(ACC) * GGSW -> GLWE
    mul_ggsw_glwe<Torus, cluster_group, params>(
        accumulator, accumulator_fft, block_join_buffer, keybundle,
-        polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
+        polynomial_size, glwe_dimension, level_count, 0, cluster, support_dsm);

    synchronize_threads_in_block();
  }

-  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
+        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

@@ -137,47 +148,37 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        accumulator, global_slice);
-  }
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // distributed shared memory
 }

 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size; // accumulator
+  return sizeof(Torus) * polynomial_size;  // accumulator
 }
 template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
+uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
    uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size * 2; // accumulator
+  return sizeof(Torus) * polynomial_size * 2+ // accumulator
+  sizeof(Torus) * polynomial_size; // keybundle accumulator
 }

-template <typename Torus, typename STorus, typename params>
+template <typename Torus, typename params>
 __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0) {
-
-  cudaSetDevice(gpu_index);
+    bool allocate_gpu_memory) {

  bool supports_dsm =
      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -194,6 +195,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
  if (max_shared_memory < full_sm_keybundle) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
@@ -217,44 +220,42 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
  if (max_shared_memory <
      partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        minimum_sm_tbc_accumulate));
    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory <
             full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

-  if (!lwe_chunk_size)
-    lwe_chunk_size =
-        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                          polynomial_size, max_shared_memory);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
@@ -262,20 +263,17 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
 }

 template <typename Torus, class params>
-__host__ void execute_tbc_external_product_loop(
+__host__ void execute_tbc(
    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    Torus *lwe_array_out, Torus *lwe_output_indexes,
+    Torus *lwe_array_out, Torus *lwe_output_indexes,Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset,
-    uint32_t gpu_offset) {
+    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count) {

-  cudaSetDevice(gpu_index);
  auto supports_dsm =
      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);

  uint64_t full_dm =
      get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
@@ -289,12 +287,12 @@ __host__ void execute_tbc_external_product_loop(
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);

-  uint32_t keybundle_size_per_input =
-      lwe_chunk_size * level_count * (glwe_dimension + 1) *
-      (glwe_dimension + 1) * (polynomial_size / 2);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);

-  uint32_t chunk_size =
-      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  uint32_t keybundle_size_per_input =
+      level_count * (glwe_dimension + 1) *
+      (glwe_dimension + 1) * (polynomial_size / 2);

  auto d_mem = buffer->d_mem_acc_tbc;
  auto keybundle_fft = buffer->keybundle_fft;
@@ -324,82 +322,67 @@ __host__ void execute_tbc_external_product_loop(
    config.dynamicSmemBytes = minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
        &config,
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               NOSM>,
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-        lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
+        lwe_array_in, lwe_input_indexes, bootstrapping_key, keybundle_fft, buffer_fft,
        global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
-        base_log, level_count, grouping_factor, lwe_offset, chunk_size,
-        keybundle_size_per_input, d_mem, full_dm, supports_dsm, gpu_offset));
+        base_log, level_count, grouping_factor, 
+        keybundle_size_per_input, d_mem, full_dm, supports_dsm));
  } else if (max_shared_memory < full_dm + minimum_dm) {
    config.dynamicSmemBytes = partial_dm + minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
        &config,
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               PARTIALSM>,
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-        lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
+        lwe_array_in, lwe_input_indexes, bootstrapping_key, keybundle_fft, buffer_fft,
        global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
-        base_log, level_count, grouping_factor, lwe_offset, chunk_size,
-        keybundle_size_per_input, d_mem, partial_dm, supports_dsm, gpu_offset));
+        base_log, level_count, grouping_factor,
+        keybundle_size_per_input, d_mem, full_dm, supports_dsm));
  } else {
    config.dynamicSmemBytes = full_dm + minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
        &config,
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               FULLSM>,
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-        lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
+        lwe_array_in, lwe_input_indexes, bootstrapping_key, keybundle_fft, buffer_fft,
        global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
-        base_log, level_count, grouping_factor, lwe_offset, chunk_size,
-        keybundle_size_per_input, d_mem, 0, supports_dsm, gpu_offset));
+        base_log, level_count, grouping_factor,
+        keybundle_size_per_input, d_mem, full_dm, supports_dsm));
  }
 }

-template <typename Torus, typename STorus, class params>
+template <typename Torus, class params>
 __host__ void host_tbc_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  cudaSetDevice(gpu_index);

-  if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, num_samples, polynomial_size, max_shared_memory);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, num_samples, polynomial_size);

-  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
-       lwe_offset += lwe_chunk_size) {
-
-    // Compute a keybundle
-    execute_compute_keybundle<Torus, params>(
-        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, max_shared_memory,
-        lwe_chunk_size, lwe_offset, gpu_offset);
-
-    // Accumulate
-    execute_tbc_external_product_loop<Torus, params>(
+    execute_tbc<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
+        lwe_input_indexes, lwe_array_out, lwe_output_indexes, bootstrapping_key,
+        buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size,
-        max_shared_memory, lwe_offset, gpu_offset);
-  }
+        grouping_factor, base_log, level_count);
 }

 template <typename Torus>
-__host__ bool
-supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory) {
+bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
+    uint32_t polynomial_size) {
  uint64_t minimum_sm =
      get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
          polynomial_size);

+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory <= minimum_sm) {
    // If we cannot store a single polynomial in a block shared memory we
    // cannot use TBC
@@ -412,7 +395,7 @@ supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+    uint32_t level_count) {

  if (!cuda_check_support_thread_block_clusters())
    return false;
@@ -425,7 +408,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
          polynomial_size);
  uint64_t minimum_sm_tbc_accumulate = 0;
  if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory))
+          Torus>(polynomial_size))
    minimum_sm_tbc_accumulate =
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
@@ -448,36 +431,37 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
   * case and it will fail if we try. Thus, since level_count *
   * (glwe_dimension+1) is usually smaller than 8 at this moment, we will
   * disable cudaFuncAttributeNonPortableClusterSizeAllowed */
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory <
      partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               NOSM>,
        cudaFuncAttributeNonPortableClusterSizeAllowed, false));
    check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
        &cluster_size,
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               NOSM>,
        &config));
  } else if (max_shared_memory <
             full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               PARTIALSM>,
        cudaFuncAttributeNonPortableClusterSizeAllowed, false));
    check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
        &cluster_size,
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               PARTIALSM>,
        &config));
  } else {
    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               FULLSM>,
        cudaFuncAttributeNonPortableClusterSizeAllowed, false));
    check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
        &cluster_size,
-        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+        device_multi_bit_programmable_bootstrap_tbc<Torus, params,
                                                               FULLSM>,
        &config));
  }
@@ -485,7 +469,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
  return cluster_size >= level_count * (glwe_dimension + 1);
 }

-template __host__ bool
+template bool
 supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
-    uint32_t polynomial_size, uint32_t max_shared_memory);
+    uint32_t polynomial_size);
 #endif // FASTMULTIBIT_PBS_H
--- a/Show More
+++ b/Show More