chore(zk): add a shortint wrapper for the CompactPkeCrs

chore(tests): make some test getters pub with a feature
chore(c_api): fix import in DynamicDistribution rust to c conversion
2026-01-11 15:48:20 -05:00 · 2024-12-12 15:46:01 +01:00 · 2024-12-12 15:46:01 +01:00 · 2024-12-12 15:46:01 +01:00 · 2024-12-12 15:46:01 +01:00 · 2024-12-12 13:50:40 +01:00
1280 changed files with 42288 additions and 14380 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -100,7 +100,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -54,22 +54,23 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-fft/**
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
            csprng:
-              - concrete-csprng/**
+              - tfhe-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
            versionable:
@@ -132,7 +133,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -158,14 +159,14 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

-      - name: Run concrete-csprng tests
+      - name: Run tfhe-csprng tests
        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
-          make test_concrete_csprng
+          make test_tfhe_csprng

      - name: Run tfhe-zk-pok tests
        if: needs.should-run.outputs.zk_pok_test == 'true'
@@ -198,7 +199,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
+        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
        with:
          path: |
            ~/.nvm
@@ -211,7 +212,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
+        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -269,7 +270,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -42,22 +42,24 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-fft/**
              - tfhe-zk-pok/**
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
              - tfhe/src/integer/**
+              - .github/workflows/aws_tfhe_integer_tests.yml

  setup-instance:
    name: Setup instance (unsigned-integer-tests)
@@ -73,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +99,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -138,7 +140,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -42,22 +42,24 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-fft/**
              - tfhe-zk-pok/**
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
              - tfhe/src/integer/**
+              - .github/workflows/aws_tfhe_signed_integer_tests.yml

  setup-instance:
    name: Setup instance (unsigned-integer-tests)
@@ -73,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +99,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -142,7 +144,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -40,6 +40,9 @@ jobs:
      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.shortint_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
+      strings_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.strings_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
      high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.high_level_api_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -60,20 +63,21 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-fft/**
              - tfhe-zk-pok/**
            csprng:
-              - concrete-csprng/**
+              - tfhe-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
            core_crypto:
@@ -84,6 +88,11 @@ jobs:
            shortint:
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
+            strings:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+              - tfhe/src/strings/**
            high_level_api:
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
@@ -113,6 +122,7 @@ jobs:
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
+          steps.changed-files.outputs.strings_any_changed == 'true' ||
          steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
          steps.changed-files.outputs.c_api_any_changed == 'true' ||
          steps.changed-files.outputs.examples_any_changed == 'true' ||
@@ -132,7 +142,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -158,14 +168,14 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

-      - name: Run concrete-csprng tests
+      - name: Run tfhe-csprng tests
        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
-          make test_concrete_csprng
+          make test_tfhe_csprng

      - name: Run tfhe-zk-pok tests
        if: needs.should-run.outputs.zk_pok_test == 'true'
@@ -202,6 +212,11 @@ jobs:
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_shortint_ci

+      - name: Run strings tests
+        if: needs.should-run.outputs.strings_test == 'true'
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_strings
+
      - name: Run high-level API tests
        if: needs.should-run.outputs.high_level_api_test == 'true'
        run: |
@@ -235,7 +250,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -61,7 +61,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
+        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
        with:
          path: |
            ~/.nvm
@@ -74,7 +74,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
+        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -119,7 +119,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -80,8 +80,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      - name: Measure key sizes
        run: |
@@ -128,7 +127,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,7 +58,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -78,8 +78,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --name-suffix avx512 \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -116,7 +115,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -54,7 +54,7 @@ jobs:
          echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -79,8 +79,7 @@ jobs:
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -117,6 +116,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -127,7 +127,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -154,7 +154,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --throughput
+      

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,6 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
@@ -85,7 +84,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -126,8 +125,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --name-suffix avx512 \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -169,7 +167,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -1,195 +1,41 @@
-# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: ERC20 GPU H100 benchmarks
+# Run CUDA ERC20 benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
+name: Cuda ERC20 benchmarks

 on:
  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 5a.m.
-    - cron: '0 5 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+    inputs:
+      profile:
+        description: "Instance type"
+        required: true
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "multi-h100 (n3-H100x8)"

 jobs:
-  setup-instance:
-    name: Setup instance (cuda-erc20-benchmarks)
+  parse-inputs:
    runs-on: ubuntu-latest
-    if:  github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-erc20-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
+      - name: Parse profile
+        id: parse_profile
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
+      - name: Parse hardware name
+        id: parse_hardware_name
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks
-        run: |
-          make bench_hlapi_erc20_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512
-
-      - name: Parse PBS counts
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
-          --append-results
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_erc20
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-erc20-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-erc20-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+  run-benchmarks:
+    name: Run benchmarks
+    needs: parse-inputs
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: ${{ needs.parse-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_erc20_8h100.yml
+++ b/.github/workflows/benchmark_gpu_erc20_8h100.yml
@@ -1,195 +0,0 @@
-# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: ERC20 GPU 8xH100 benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 5a.m.
-    - cron: '0 5 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-erc20-benchmarks)
-    runs-on: ubuntu-latest
-    if:  github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: multi-h100
-
-  cuda-erc20-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks
-        run: |
-          make bench_hlapi_erc20_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x8" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512
-
-      - name: Parse PBS counts
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
-          --append-results
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_erc20
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-erc20-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
-          SLACK_MESSAGE: "ERC20 8xH100 benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-erc20-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -1,11 +1,35 @@
 # Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: ERC20 GPU 2xH100 benchmarks
+name: Cuda ERC20 benchmarks - common

 on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 5a.m.
-    - cron: '0 5 * * 6'
+  workflow_call:
+    inputs:
+      backend:
+        type: string
+        default: hyperstack
+      profile:
+        type: string
+        required: true
+      hardware_name:
+        type: string
+        required: true
+    secrets:
+      FHE_ACTIONS_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true

 env:
  CARGO_TERM_COLOR: always
@@ -30,17 +54,17 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: 2-h100
+          backend: ${{ inputs.backend }}
+          profile: ${{ inputs.profile }}

  cuda-erc20-benchmarks:
-    name: Execute GPU integer benchmarks
+    name: Cuda ERC20 benchmarks (${{ inputs.profile }})
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -87,7 +111,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -123,7 +147,7 @@ jobs:
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n3-H100x2" \
+          --hardware "${{ inputs.hardware_name }}" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -132,12 +156,6 @@ jobs:
          --walk-subdirs \
          --name-suffix avx512

-      - name: Parse PBS counts
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
-          --append-results
-
      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
@@ -168,17 +186,17 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
-          SLACK_MESSAGE: "ERC20 2xH100 benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Cuda ERC20 benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: Teardown instance (cuda-erc20-benchmarks)
+    name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -192,4 +210,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-erc20-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_erc20_weekly.yml
+++ b/.github/workflows/benchmark_gpu_erc20_weekly.yml
@@ -0,0 +1,35 @@
+# Run CUDA ERC20 benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
+name: Cuda ERC20 weekly benchmarks
+
+on:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 5a.m.
+    - cron: '0 5 * * 6'
+
+jobs:
+  run-benchmarks-1-h100:
+    name: Run benchmarks (1xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+    secrets: inherit
+
+  run-benchmarks-2-h100:
+    name: Run benchmarks (2xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
+    secrets: inherit
+
+  run-benchmarks-8-h100:
+    name: Run benchmarks (8xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: multi-h100
+      hardware_name: n3-H100x8
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -1,201 +1,78 @@
-# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer GPU benchmarks
+# Run CUDA benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
+name: Cuda benchmarks

 on:
  workflow_dispatch:
-  push:
-    branches:
-      - main
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+    inputs:
+      profile:
+        description: "Instance type"
+        required: true
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+      command:
+        description: "Benchmark command to run"
+        type: choice
+        default: integer_multi_bit
+        options:
+          - integer
+          - integer_multi_bit
+          - integer_compression
+          - pbs
+          - ks
+      op_flavor:
+        description: "Operations set to run"
+        type: choice
+        default: default
+        options:
+          - default
+          - fast_default
+          - unchecked
+      all_precisions:
+        description: "Run all precisions"
+        type: boolean
+        default: false
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both

 jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-benchmarks)
+  parse-inputs:
    runs-on: ubuntu-latest
-    if:  github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-integer-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
+      - name: Parse profile
+        id: parse_profile
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
+      - name: Parse hardware name
+        id: parse_hardware_name
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+  run-benchmarks:
+    name: Run benchmarks
+    needs: parse-inputs
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    with:
+      profile: ${{ needs.parse-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
+      command: ${{ inputs.command }}
+      op_flavor: ${{ inputs.op_flavor }}
+      bench_type: ${{ inputs.bench_type }}
+      all_precisions: ${{ inputs.all_precisions }}
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_integer_2H100_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_2H100_full.yml
@@ -1,194 +0,0 @@
-# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer 2xH100 benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: 2-h100
-
-  cuda-integer-full-2-gpu-benchmarks:
-    name: Execute 2xH100 integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x2" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
@@ -1,21 +1,47 @@
-# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer multi GPU Multi-bit benchmarks
+# Run integer benchmarks on CUDA instance and return parsed results to Slab CI bot.
+name: Cuda benchmarks - common

 on:
-  workflow_dispatch:
+  workflow_call:
    inputs:
+      backend:
+        type: string
+        default: hyperstack
+      profile:
+        type: string
+        required: true
+      hardware_name:
+        type: string
+        required: true
+      command: # Use a comma separated values to generate an array
+        type: string
+        required: true
+      op_flavor: # Use a comma separated values to generate an array
+        type: string
+        required: true
+      bench_type:
+        type: string
+        default: latency
      all_precisions:
-        description: "Run all precisions"
        type: boolean
        default: false
-      fast_default:
-        description: "Run only deduplicated default operations without scalar variants"
-        type: boolean
-        default: false
-
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
+    secrets:
+      FHE_ACTIONS_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true

 env:
  CARGO_TERM_COLOR: always
@@ -28,32 +54,82 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE
-  BENCH_OP_FLAVOR: default

 jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+  prepare-matrix:
+    name: Prepare operations matrix
+    runs-on: ubuntu-latest
+    outputs:
+      command: ${{ steps.set_command.outputs.command }}
+      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
+    steps:
+      - name: Set single command
+        if: ${{ !contains(inputs.command, ',')}}
+        run: |
+          echo "COMMAND=[\"${{ inputs.command }}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set multiple commands
+        if: ${{ contains(inputs.command, ',')}}
+        run: |
+          PARSED_COMMAND=$(echo "${{ inputs.command }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
+          echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set single operations flavor
+        if: ${{ !contains(inputs.op_flavor, ',')}}
+        run: |
+          echo "OP_FLAVOR=[\"${{ inputs.op_flavor }}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set multiple operations flavors
+        if: ${{ contains(inputs.op_flavor, ',')}}
+        run: |
+          PARSED_OP_FLAVOR=$(echo "${{ inputs.op_flavor }}" | sed 's/[[:space:]]*,[[:space:]]*/", "/g')
+          echo "OP_FLAVOR=[\"${PARSED_OP_FLAVOR}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set benchmark types
+        run: |
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Set command output
+        id: set_command
+        run: |
+          echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
+
+      - name: Set operation flavor output
+        id: set_op_flavor
+        run: |
+          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
+
+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
+  setup-instance:
+    name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
+    needs: prepare-matrix
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch' }}
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: multi-h100
+          backend: ${{ inputs.backend }}
+          profile: ${{ inputs.profile }}

-  cuda-integer-multi-bit-multi-gpu-benchmarks:
-    name: Execute multi GPU integer multi-bit benchmarks
-    needs: setup-instance
+  cuda-benchmarks:
+    name: Cuda benchmarks (${{ inputs.profile }})
+    needs: [ prepare-matrix, setup-instance ]
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
    continue-on-error: true
@@ -61,6 +137,10 @@ jobs:
      fail-fast: false
      max-parallel: 1
      matrix:
+        command: ${{ fromJSON(needs.prepare-matrix.outputs.command) }}
+        op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
+        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
@@ -101,7 +181,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -132,29 +212,24 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
        run: |
          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"

-      - name: Should run fast subset benchmarks
-        if: inputs.fast_default
+      - name: Run benchmarks
        run: |
-          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make bench_unsigned_integer_multi_bit_gpu
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n3-H100x8" \
+          --hardware "${{ inputs.hardware_name }}" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -162,12 +237,12 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_integer
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -178,26 +253,26 @@ jobs:

  slack-notify:
    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    needs: [ setup-instance, cuda-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-benchmarks.result }}
+          SLACK_MESSAGE: "Cuda benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: Teardown instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+    name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -211,4 +286,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_full.yml
@@ -1,200 +0,0 @@
-# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer GPU full benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-integer-full-benchmarks:
-    name: Execute GPU integer benchmarks for all operations flavor
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      # Run these benchmarks only once
-      - name: Run compression benchmarks with AVX512
-        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
-        run: |
-          make bench_integer_compression_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_bit.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit.yml
@@ -1,224 +0,0 @@
-# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
-name: Integer GPU Multi-bit benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      all_precisions:
-        description: "Run all precisions"
-        type: boolean
-        default: false
-      fast_default:
-        description: "Run only deduplicated default operations without scalar variants"
-        type: boolean
-        default: false
-
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  FAST_BENCH: TRUE
-  BENCH_OP_FLAVOR: default
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-multi-bit-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-integer-multi-bit-benchmarks:
-    name: Execute GPU integer multi-bit benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Should run benchmarks with all precisions
-        if: inputs.all_precisions
-        run: |
-          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
-
-      - name: Should run fast subset benchmarks
-        if: inputs.fast_default
-        run: |
-          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make bench_unsigned_integer_multi_bit_gpu
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
@@ -1,194 +0,0 @@
-# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer multi GPU full benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-multi-gpu-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: multi-h100
-
-  cuda-integer-full-multi-gpu-benchmarks:
-    name: Execute multi GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x8" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_weekly.yml
+++ b/.github/workflows/benchmark_gpu_integer_weekly.yml
@@ -0,0 +1,60 @@
+# Run CUDA benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
+name: Cuda weekly benchmarks
+
+on:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+
+jobs:
+  run-benchmarks-1-h100:
+    name: Run benchmarks (1xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: integer,integer_multi_bit
+      op_flavor: default
+      bench_type: latency
+      all_precisions: true
+    secrets: inherit
+
+  run-benchmarks-2-h100:
+    name: Run benchmarks (2xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
+      command: integer_multi_bit
+      op_flavor: default
+      bench_type: latency
+      all_precisions: true
+    secrets: inherit
+
+  run-benchmarks-8-h100:
+    name: Run benchmarks (8xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    with:
+      profile: multi-h100
+      hardware_name: n3-H100x8
+      command: integer_multi_bit
+      op_flavor: default
+      bench_type: latency
+      all_precisions: true
+    secrets: inherit
+
+  run-benchmarks-l40:
+    name: Run benchmarks (L40)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    with:
+      profile: l40
+      hardware_name: n3-L40x1
+      command: integer_multi_bit,integer_compression,pbs,ks
+      op_flavor: default
+      bench_type: latency
+      all_precisions: true
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_l40.yml
+++ b/.github/workflows/benchmark_gpu_l40.yml
@@ -1,206 +0,0 @@
-# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
-name: Cuda benchmarks (L40)
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-l40-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: l40 
-
-  cuda-l40-benchmarks:
-    name: Cuda benchmarks (L40)
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Run compression benchmarks with AVX512
-        run: |
-          make bench_integer_compression_gpu
-
-      - name: Run PBS benchmarks 
-        run: |
-          make bench_pbs_gpu
-
-      - name: Run KS benchmarks 
-        run: |
-          make bench_ks_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-L40x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-l40-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}
-          SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-l40-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -8,6 +8,14 @@ on:
        description: "Run all precisions"
        type: boolean
        default: false
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both

  schedule:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
@@ -36,10 +44,10 @@ jobs:
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
    steps:
      - name: Weekly benchmarks
-        if: github.event_name == 'workflow_dispatch' ||
-          github.event.schedule == '0 1 * * 6'
+        if: github.event.schedule == '0 1 * * 6'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"

@@ -48,11 +56,31 @@ jobs:
        run: |
          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"

-      -  name: Set operation flavor output
-         id: set_op_flavor
-         run: |
+      - name: Set benchmark types
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Default benchmark type
+        if: github.event_name != 'workflow_dispatch'
+        run: |
+          echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
+
+      - name: Set operation flavor output
+        id: set_op_flavor
+        run: |
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
  setup-instance:
    name: Setup instance (integer-benchmarks)
    needs: prepare-matrix
@@ -62,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -85,6 +113,7 @@ jobs:
      matrix:
        command: [ integer, integer_multi_bit]
        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -101,7 +130,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -119,13 +148,13 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}

-      # Run these benchmarks only once
+      # Run these benchmarks only once per benchmark type
      - name: Run compression benchmarks with AVX512
        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
        run: |
-          make bench_integer_compression
+          make BENCH_TYPE=${{ matrix.bench_type }} bench_integer_compression

      - name: Parse results
        run: |
@@ -138,12 +167,12 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -168,7 +197,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -120,8 +120,7 @@ jobs:
          --commit-date "${COMMIT_DATE}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      # This small benchmark needs to be executed only once.
      - name: Measure key sizes
@@ -164,7 +163,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -8,6 +8,14 @@ on:
        description: "Run all precisions"
        type: boolean
        default: false
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both

  schedule:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
@@ -36,10 +44,10 @@ jobs:
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
    steps:
      - name: Weekly benchmarks
-        if: github.event_name == 'workflow_dispatch' ||
-          github.event.schedule == '0 1 * * 6'
+        if: github.event.schedule == '0 1 * * 6'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"

@@ -48,11 +56,31 @@ jobs:
        run: |
          echo "OP_FLAVOR=[\"default\", \"unchecked\"]" >> "${GITHUB_ENV}"

+      - name: Set benchmark types
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Default benchmark type
+        if: github.event_name != 'workflow_dispatch'
+        run: |
+          echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
+
      - name: Set operation flavor output
        id: set_op_flavor
        run: |
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
  setup-instance:
    name: Setup instance (signed-integer-benchmarks)
    needs: prepare-matrix
@@ -62,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +112,8 @@ jobs:
      max-parallel: 1
      matrix:
        command: [ integer, integer_multi_bit ]
-        op_flavor: [ default, unchecked ]
+        op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -101,7 +130,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -119,7 +148,7 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_signed_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -132,12 +161,12 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -162,7 +191,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -16,6 +16,9 @@ on:
  push:
    branches:
      - "main"
+    paths:
+      - tfhe-fft/**
+      - .github/workflows/benchmark_tfhe_fft.yml
  schedule:
    # Job will be triggered each Thursday at 11p.m.
    - cron: '0 23 * * 4'
@@ -29,7 +32,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,7 +50,7 @@ jobs:
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0

@@ -65,11 +68,11 @@ jobs:
          toolchain: nightly
          override: true

-      - name: Run benchmarks
+      - name: Run benchmarks with AVX512
        run: |
-          make FFT128_SUPPORT=ON bench
+          make bench_fft

-      - name: Parse results
+      - name: Parse AVX512 results
        run: |
          python3 ./ci/fft_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database concrete_fft \
@@ -77,28 +80,17 @@ jobs:
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}"
-
-          rm -rf target/criterion benchmarks_parameters/
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FFT128_SUPPORT=ON AVX512_SUPPORT=ON bench
-
-      - name: Parse AVX512 results
-        run: |
-          python3 ./ci/fft_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --name-suffix avx512 \
-          --append-results
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          repository: zama-ai/slab
          path: slab
@@ -121,7 +113,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-fft benchmarks failed. (${{ env.ACTION_RUN_URL }})"
@@ -134,7 +126,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,7 +137,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "EC2 teardown (fft-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -0,0 +1,143 @@
+# Run NTT benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: NTT benchmarks
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  RUST_BACKTRACE: "full"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "main"
+    paths:
+      - tfhe-ntt/**
+      - .github/workflows/benchmark_tfhe_ntt.yml
+  schedule:
+    # Job will be triggered each Friday at 11p.m.
+    - cron: "0 23 * * 5"
+
+jobs:
+  setup-ec2:
+    name: Setup EC2 instance (ntt-benchmarks)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
+  ntt-benchmarks:
+    name: Execute NTT benchmarks in EC2
+    needs: setup-ec2
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: nightly
+          override: true
+
+      - name: Run benchmarks
+        run: |
+          make bench_ntt
+
+      - name: Parse results
+        run: |
+          python3 ./ci/ntt_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database concrete_ntt \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --name-suffix avx512
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: ${{ github.sha }}_ntt
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-ntt benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-ec2:
+    name: Teardown EC2 instance (ntt-benchmarks)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [setup-ec2, ntt-benchmarks]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -36,7 +36,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -58,7 +58,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +91,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -118,8 +118,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -156,7 +155,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -36,16 +36,17 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
            wasm_bench:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-zk-pok/**
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
@@ -64,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -98,7 +99,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -108,7 +109,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
+        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
        with:
          path: |
            ~/.nvm
@@ -121,7 +122,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
+        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -199,7 +200,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -3,6 +3,16 @@ name: PKE ZK benchmarks

 on:
  workflow_dispatch:
+    inputs:
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both
+
  push:
    branches:
      - main
@@ -33,16 +43,17 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
            zk_pok:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-fft/**
              - tfhe-zk-pok/**
              - tfhe/src/core_crypto/**
@@ -52,10 +63,37 @@ jobs:
              - tfhe/benches/integer/zk_pke.rs
              - .github/workflows/zk_pke_benchmark.yml

+  prepare-matrix:
+    name: Prepare operations matrix
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
+    steps:
+      - name: Set benchmark types
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Default benchmark type
+        if: github.event_name != 'workflow_dispatch'
+        run: |
+          echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
+
+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
  setup-instance:
    name: Setup instance (pke-zk-benchmarks)
    runs-on: ubuntu-latest
-    needs: should-run
+    needs: [ should-run, prepare-matrix ]
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' &&
@@ -66,7 +104,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,11 +116,15 @@ jobs:
  pke-zk-benchmarks:
    name: Execute PKE ZK benchmarks
    if: needs.setup-instance.result != 'skipped'
-    needs: setup-instance
+    needs: [ prepare-matrix, setup-instance ]
    concurrency:
      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      max-parallel: 1
+      matrix:
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -99,7 +141,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -112,7 +154,7 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make bench_integer_zk
+          make BENCH_TYPE=${{ matrix.bench_type }} bench_integer_zk

      - name: Parse results
        run: |
@@ -126,7 +168,7 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ matrix.bench_type }}

      - name: Parse CRS sizes results
        run: |
@@ -169,7 +211,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -28,7 +28,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -47,10 +47,10 @@ jobs:
        run: |
          make pcc

-      - name: Build concrete-csprng
+      - name: Build tfhe-csprng
        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
-          make build_concrete_csprng
+          make build_tfhe_csprng

      - name: Build Release core
        if: ${{ contains(matrix.os, 'ubuntu') }}
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -2,9 +2,6 @@
 name: Cargo Build tfhe-fft

 on:
-  push:
-    branches:
-      - 'main'
  pull_request:

 env:
@@ -24,7 +21,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -0,0 +1,40 @@
+# Build tfhe-ntt
+name: Cargo Build tfhe-ntt
+
+on:
+  pull_request:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  cargo-builds:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: stable
+          override: true
+
+      - name: Run pcc checks
+        run: |
+          make pcc_ntt
+
+      - name: Build release
+        run: |
+          make build_ntt
+
+      - name: Build release no-std
+        run: |
+          make build_ntt_no_std
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -2,9 +2,6 @@
 name: Cargo Test tfhe-fft

 on:
-  push:
-    branches:
-      - "main"
  pull_request:

 env:
@@ -22,7 +19,7 @@ jobs:
        runner_type: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
-      - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -47,7 +44,7 @@ jobs:
      matrix:
        runner_type: [ubuntu-latest, macos-latest, windows-latest]
    steps:
-      - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -66,7 +63,7 @@ jobs:
  cargo-tests-node-js:
    runs-on: "ubuntu-latest"
    steps:
-      - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Test node js
        run: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -0,0 +1,54 @@
+# Test tfhe-ntt
+name: Cargo Test tfhe-ntt
+
+on:
+  pull_request:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  cargo-tests:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: stable
+          override: true
+
+      - name: Test debug
+        run: make test_ntt
+
+      - name: Test no-std
+        run: make test_ntt_no_std
+
+  cargo-tests-nightly:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: nightly
+          override: true
+
+      - name: Test nightly
+        run: make test_ntt_nightly
+
+      - name: Test no-std nightly
+        run: make test_ntt_no_std_nightly
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -27,7 +27,7 @@ jobs:
          make lint_workflow

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@38608ef4fb69adae7f1eac6eeb88e67b7d083bfd # v3.0.16
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@5d6ac37a4cef8b8df67f482a8e384987766f0213 # v3.0.17
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,19 +47,19 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          files_yaml: |
            tfhe:
              - tfhe/src/**
-            concrete_csprng:
-              - concrete-csprng/src/**
+            tfhe_csprng:
+              - tfhe-csprng/src/**

      - name: Generate Keys
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
@@ -83,7 +83,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@5c47607acb93fed5485fdbf7232e8a31425f672a
+        uses: codecov/codecov-action@7f8b4b4bde536c465e797be725718b88c5d95e0e
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@5c47607acb93fed5485fdbf7232e8a31425f672a
+        uses: codecov/codecov-action@7f8b4b4bde536c465e797be725718b88c5d95e0e
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -121,7 +121,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -65,7 +65,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (csprng-randomness-tests)
@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -40,7 +40,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -31,10 +31,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -67,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -120,7 +121,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -186,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -30,10 +30,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -118,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -184,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -76,7 +76,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -139,7 +139,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -31,10 +31,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -67,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -120,7 +121,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -189,7 +190,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -0,0 +1,146 @@
+name: AWS Long Run Tests on GPU
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  schedule:
+    # Weekly tests will be triggered each Friday at 1a.m.
+    - cron: '0 1 * * FRI'
+
+jobs:
+  setup-instance:
+    name: Setup instance (gpu-tests)
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: 2-h100
+
+  cuda-tests:
+    name: Long run GPU H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run tests
+        run: |
+          make test_integer_long_run_gpu
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests.result }}
+          SLACK_MESSAGE: "Integer GPU H100 long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (gpu-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (gpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -110,7 +110,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -171,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -171,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -38,10 +38,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -74,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -128,7 +129,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -188,7 +189,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -171,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -171,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -74,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -125,7 +125,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -185,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -0,0 +1,94 @@
+name: AWS Long Run Tests on CPU
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  schedule:
+    # Weekly tests will be triggered each Friday at 1a.m.
+    - cron: '0 1 * * FRI'
+
+jobs:
+  setup-instance:
+    name: Setup instance (cpu-tests)
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+  cpu-tests:
+    name: Long run CPU tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Run tests
+        run: |
+          make test_integer_long_run
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CPU long run tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cpu-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cpu-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -39,7 +39,7 @@ jobs:
          persist-credentials: "false"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -64,13 +64,29 @@ jobs:
          make test_fft_no_std_nightly
          # we don't run the js stuff here as it's causing issues with the M1 config

+      - name: Run pcc NTT checks
+        run: |
+          make pcc_ntt
+
+      - name: Build NTT release
+        run: |
+          make build_ntt
+
+      - name: Build NTT release no-std
+        run: |
+          make build_ntt_no_std
+
+      - name: Run NTT tests
+        run: |
+          make test_ntt_all
+
      - name: Run pcc checks
        run: |
          make pcc

-      - name: Build concrete-csprng
+      - name: Build tfhe-csprng
        run: |
-          make build_concrete_csprng
+          make build_tfhe_csprng

      - name: Build Release core
        run: |
@@ -96,9 +112,9 @@ jobs:
        run: |
          make build_c_api

-      - name: Run concrete-csprng tests
+      - name: Run tfhe-csprng tests
        run: |
-          make test_concrete_csprng
+          make test_tfhe_csprng

      - name: Run tfhe-zk-pok tests
        run: |
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -46,6 +46,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe
@@ -84,6 +85,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Create NPM version tag
        if: ${{ inputs.npm_latest_tag }}
        run: |
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -1,4 +1,4 @@
-name: Publish concrete-csprng release
+name: Publish tfhe-csprng release

 on:
  workflow_dispatch:
@@ -19,7 +19,7 @@ jobs:
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  publish_release:
-    name: Publish concrete-csprng Release
+    name: Publish tfhe-csprng Release
    needs: verify_tag
    runs-on: ubuntu-latest
    steps:
@@ -27,13 +27,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p concrete-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          cargo publish -p tfhe-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -43,6 +44,6 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -36,7 +36,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,13 +64,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -119,7 +120,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -25,12 +25,13 @@ jobs:
    needs: verify_tag
    steps:
      - name: Checkout
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0

      - name: Publish crate.io package
        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
          cargo publish -p tfhe-fft --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
@@ -38,7 +39,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -0,0 +1,49 @@
+# Publish new release of tfhe-ntt
+name: Publish tfhe-ntt release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  verify_tag:
+    uses: ./.github/workflows/verify_tagged_commit.yml
+    secrets:
+      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+
+  publish_release:
+    name: Publish tfhe-ntt Release
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+
+      - name: Publish crate.io package
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          cargo publish -p tfhe-ntt --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-ntt release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_tfhe_versionable.yml
+++ b/.github/workflows/make_release_tfhe_versionable.yml
@@ -27,6 +27,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Publish proc-macro crate
        env:
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -28,6 +28,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Publish crate.io package
        env:
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -16,6 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: git-sync
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
--- a/.gitignore
+++ b/.gitignore
@@ -12,8 +12,8 @@ target/
 **/*.bin

 # Some of our bench outputs
-/tfhe/benchmarks_parameters
-/tfhe-zk-pok/benchmarks_parameters
+/crates/tfhe/benchmarks_parameters
+/crates/tfhe-zk-pok/benchmarks_parameters
 **/*.csv

 # dieharder run log
@@ -26,11 +26,11 @@ dieharder_run.log
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/

 # WASM tests
-tfhe/web_wasm_parallel_tests/server.PID
+crates/tfhe/tests/web_wasm_parallel/server.PID
 venv/
 web-test-runner/
 node_modules/
 package-lock.json

 # Dir used for backward compatibility test data
-tfhe/tfhe-backward-compat-data/
+crates/tfhe/tfhe-backward-compat-data/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,22 +1,18 @@
 [workspace]
 resolver = "2"
 members = [
-    "tfhe",
-    "tfhe-fft",
-    "tfhe-zk-pok",
+    "crates/*",
    "tasks",
    "apps/trivium",
-    "concrete-csprng",
    "backends/tfhe-cuda-backend",
-    "utils/tfhe-versionable",
-    "utils/tfhe-versionable-derive",
 ]

 exclude = [
-    "tfhe/backward_compatibility_tests",
+    "crates/tfhe/backward_compatibility_tests",
    "utils/cargo-tfhe-lints-inner",
    "utils/cargo-tfhe-lints"
 ]
+
 [workspace.dependencies]
 aligned-vec = { version = "0.5", default-features = false }
 bytemuck = "1.14.3"
@@ -26,6 +22,19 @@ pulp = { version = "0.18.22", default-features = false }
 serde = { version = "1.0", default-features = false }
 wasm-bindgen = ">=0.2.86,<0.2.94"

+[workspace.package]
+version = "0.11.0"
+license = "BSD-3-Clause-Clear"
+repository = "https://github.com/zama-ai/tfhe-rs"
+documentation = "https://docs.zama.ai/tfhe-rs"
+
+[workspace.lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = [
+    'cfg(bench)',
+    'cfg(tarpaulin)',
+    'cfg(tfhe_lints)',
+] }
+
 [profile.bench]
 lto = "fat"

--- a/190
+++ b/190
@@ -5,9 +5,10 @@ CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
 TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
 CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
+TFHE_SRC:=crates/tfhe
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
 CARGO_PROFILE?=release
-MIN_RUST_VERSION:=$(shell grep '^rust-version[[:space:]]*=' tfhe/Cargo.toml | cut -d '=' -f 2 | xargs)
+MIN_RUST_VERSION:=$(shell grep '^rust-version[[:space:]]*=' $(TFHE_SRC)/Cargo.toml | cut -d '=' -f 2 | xargs)
 AVX512_SUPPORT?=OFF
 WASM_RUSTFLAGS:=
 BIG_TESTS_INSTANCE?=FALSE
@@ -18,6 +19,7 @@ FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
 NIGHTLY_TESTS?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
+BENCH_TYPE?=latency
 NODE_VERSION=22.6
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
 BACKWARD_COMPAT_DATA_BRANCH?=v0.4
@@ -27,7 +29,7 @@ TFHE_SPEC:=tfhe
 # We are kind of hacking the cut here, the version cannot contain a quote '"'
 WASM_BINDGEN_VERSION:=$(shell grep '^wasm-bindgen[[:space:]]*=' Cargo.toml | cut -d '"' -f 2 | xargs)
 WEB_RUNNER_DIR=web-test-runner
-WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
+WEB_SERVER_DIR=tfhe/tests/web_wasm_parallel
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
 # copy paste the command in the terminal and change them if required without forgetting the flags
 export RUSTFLAGS?=-C target-cpu=native
@@ -242,7 +244,7 @@ fmt_js: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt
+	$(MAKE) -C $(TFHE_SRC)/tests/web_wasm_parallel fmt

 .PHONY: fmt_gpu # Format rust and cuda code
 fmt_gpu: install_rs_check_toolchain
@@ -251,7 +253,7 @@ fmt_gpu: install_rs_check_toolchain

 .PHONY: fmt_c_tests # Format c tests
 fmt_c_tests:
-	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format -style=file -i {} \;
+	find $(TFHE_SRC)/tests/c_api/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format -style=file -i {} \;

 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
@@ -259,7 +261,7 @@ check_fmt: install_rs_check_toolchain

 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
-	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
+	find $(TFHE_SRC)/tests/c_api/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;

 .PHONY: check_fmt_gpu # Check rust and cuda code format
 check_fmt_gpu: install_rs_check_toolchain
@@ -271,7 +273,7 @@ check_fmt_js: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
+	$(MAKE) -C $(TFHE_SRC)/tests/web_wasm_parallel check_fmt

 .PHONY: check_typos # Check for typos in codebase
 check_typos: install_typos_checker
@@ -362,7 +364,7 @@ clippy_rustdoc: install_rs_check_toolchain
 	fi && \
 	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats,strings \
 		-p $(TFHE_SPEC)

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -393,17 +395,17 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

-.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
-clippy_concrete_csprng: install_rs_check_toolchain
+.PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
+clippy_tfhe_csprng: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE) \
-		-p concrete-csprng -- --no-deps -D warnings
+		-p tfhe-csprng -- --no-deps -D warnings

 .PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
 clippy_zk_pok: install_rs_check_toolchain
@@ -419,12 +421,12 @@ clippy_versionable: install_rs_check_toolchain

 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
 clippy_versionable

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
-clippy_core clippy_concrete_csprng
+clippy_core clippy_tfhe_csprng

 .PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
 clippy_cuda_backend: install_rs_check_toolchain
@@ -441,7 +443,7 @@ check_rust_bindings_did_not_change:

 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
-	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
+	cd $(TFHE_SRC) && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings

 .PHONY: build_core # Build core_crypto without experimental features
@@ -507,33 +509,33 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain

 .PHONY: build_web_js_api # Build the js API targeting the web browser
 build_web_js_api: install_rs_build_toolchain install_wasm_pack
-	cd tfhe && \
+	cd $(TFHE_SRC) && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=web \
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
-	cd tfhe && \
+	cd $(TFHE_SRC) && \
 	rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
 		wasm-pack build --release --target=web \
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
 		-Z build-std=panic_abort,std && \
 	find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
-	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
+	jq '.files += ["snippets"]' $(TFHE_SRC)/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json $(TFHE_SRC)/pkg/package.json

 .PHONY: build_node_js_api # Build the js API targeting nodejs
 build_node_js_api: install_rs_build_toolchain install_wasm_pack
-	cd tfhe && \
+	cd $(TFHE_SRC) && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=nodejs \
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok

-.PHONY: build_concrete_csprng # Build concrete_csprng
-build_concrete_csprng: install_rs_build_toolchain
+.PHONY: build_tfhe_csprng # Build tfhe_csprng
+build_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng --all-targets
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng --all-targets

 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
@@ -584,6 +586,11 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

+.PHONY: test_integer_long_run_gpu # Run the tests of the integer module including experimental on the gpu backend
+test_integer_long_run_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu,__long_run_tests -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run --test-threads=6
+
 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -764,6 +771,12 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

+.PHONY: test_integer_long_run # Run the long run tests for integer
+test_integer_long_run: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+						--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,__long_run_tests -p $(TFHE_SPEC) -- integer::server_key::radix_parallel::tests_long_run
+
+
 .PHONY: test_safe_serialization # Run the tests for safe serialization
 test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -802,7 +815,7 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 .PHONY: test_strings # Run the tests for strings ci
 test_strings: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,strings -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer,strings -p $(TFHE_SPEC) \
 		-- strings::


@@ -846,10 +859,10 @@ test_kreyvium: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-trivium -- --test-threads=1 kreyvium::

-.PHONY: test_concrete_csprng # Run concrete-csprng tests
-test_concrete_csprng: install_rs_build_toolchain
+.PHONY: test_tfhe_csprng # Run tfhe-csprng tests
+test_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng

 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok: install_rs_build_toolchain
@@ -865,7 +878,7 @@ test_zk_wasm_x86_compat_ci: check_nvm_installed

 .PHONY: test_zk_wasm_x86_compat # Check compatibility between wasm and x86_64 proofs
 test_zk_wasm_x86_compat: install_rs_build_toolchain build_node_js_api
-	cd tfhe/tests/zk_wasm_x86_test && npm install
+	cd $(TFHE_SRC)/tests/zk_wasm_x86_test && npm install
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe --test zk_wasm_x86_test --features=$(TARGET_ARCH_FEATURE),integer,zk-pok

@@ -879,11 +892,11 @@ test_versionable: install_rs_build_toolchain
 .PHONY: test_backward_compatibility_ci
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
+		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"$(TFHE_SRC)/$(BACKWARD_COMPAT_DATA_DIR)\"" \
 		--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
-test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
+test_backward_compatibility: $(TFHE_SRC)/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci

 .PHONY: backward_compat_branch # Prints the required backward compatibility branch
 backward_compat_branch:
@@ -895,7 +908,7 @@ doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)

 .PHONY: docs # Build rust doc alias for doc
 docs: doc
@@ -906,7 +919,7 @@ lint_doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc
@@ -926,11 +939,11 @@ check_md_docs_are_tested:

 .PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
 check_intra_md_links: install_mlc
-	mlc --offline --match-file-extension tfhe/docs
+	mlc --offline --match-file-extension $(TFHE_SRC)/docs

 .PHONY: check_md_links # Checks all broken links in Markdown docs
 check_md_links: install_mlc
-	mlc --match-file-extension tfhe/docs
+	mlc --match-file-extension $(TFHE_SRC)/docs

 .PHONY: check_compile_tests # Build tests in debug without running them
 check_compile_tests: install_rs_build_toolchain
@@ -955,7 +968,7 @@ check_compile_tests_benches_gpu: install_rs_build_toolchain

 .PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
 test_nodejs_wasm_api: build_node_js_api
-	cd tfhe/js_on_wasm_tests && npm install && npm run test
+	cd $(TFHE_SRC)/tests/js_on_wasm && npm install && npm run test

 .PHONY: test_nodejs_wasm_api_ci # Run tests for the nodejs on wasm API
 test_nodejs_wasm_api_ci: build_node_js_api
@@ -1015,7 +1028,7 @@ no_dbg_log:
 	@./scripts/no_dbg_calls.sh

 .PHONY: dieharder_csprng # Run the dieharder test suite on our CSPRNG implementation
-dieharder_csprng: install_dieharder build_concrete_csprng
+dieharder_csprng: install_dieharder build_tfhe_csprng
 	./scripts/dieharder_test.sh

 #
@@ -1029,40 +1042,42 @@ print_doc_bench_parameters:

 .PHONY: bench_integer # Run benchmarks for unsigned integer
 bench_integer: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
@@ -1070,7 +1085,7 @@ bench_integer_multi_bit: install_rs_check_toolchain

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
@@ -1078,23 +1093,23 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
+	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
+	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
@@ -1116,7 +1131,7 @@ bench_shortint_oprf: install_rs_check_toolchain

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
@@ -1261,9 +1276,9 @@ write_params_to_file: install_rs_check_toolchain

 .PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
 clone_backward_compat_data:
-	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
+	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) $(TFHE_SRC)/$(BACKWARD_COMPAT_DATA_DIR)

-tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
+$(TFHE_SRC)/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data

 #
 # Real use case examples
@@ -1341,7 +1356,7 @@ build_fft: install_rs_build_toolchain
 		--features=fft128

 .PHONY: build_fft_no_std
-buildfft__no_std: install_rs_build_toolchain
+build_fft_no_std: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
 		--no-default-features
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
@@ -1413,6 +1428,79 @@ bench_fft: install_rs_check_toolchain
 		--features=fft128
 #============================End FFT Section ==================================

+#=============================== NTT Section ==================================
+.PHONY: doc_ntt # Build rust doc for tfhe-ntt
+doc_ntt: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
+	RUSTDOCFLAGS="--html-in-header katex-header.html" \
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
+		--all-features --no-deps -p tfhe-ntt
+
+.PHONY: docs_ntt # Build rust doc tfhe-ntt, alias for doc
+docs_ntt: doc_ntt
+
+.PHONY: lint_doc_ntt # Build rust doc for tfhe-ntt with linting enabled
+lint_doc_ntt: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
+	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
+		--all-features --no-deps -p tfhe-ntt
+
+.PHONY: lint_docs_ntt # Build rust doc for tfhe-ntt with linting enabled, alias for lint_doc
+lint_docs_ntt: lint_doc_ntt
+
+.PHONY: clippy_ntt # Run clippy lints on tfhe-ntt
+clippy_ntt: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--all-features -p tfhe-ntt -- --no-deps -D warnings
+
+.PHONY: pcc_ntt # pcc stands for pre commit checks
+pcc_ntt: check_fmt lint_doc_ntt clippy_ntt
+
+.PHONY: build_ntt
+build_ntt: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-ntt
+
+.PHONY: build_ntt_no_std
+build_ntt_no_std: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-ntt \
+		--no-default-features
+
+##### Tests #####
+
+.PHONY: test_ntt
+test_ntt: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt
+
+.PHONY: test_ntt_nightly
+test_ntt_nightly: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-ntt \
+		--features=nightly
+
+.PHONY: test_ntt_no_std
+test_ntt_no_std: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt \
+		--no-default-features 
+
+.PHONY: test_ntt_no_std_nightly
+test_ntt_no_std_nightly: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-ntt \
+		--no-default-features \
+		--features=nightly
+
+.PHONY: test_ntt_all
+test_ntt_all: test_ntt test_ntt_no_std test_ntt_nightly test_ntt_no_std_nightly
+
+##### Bench #####
+
+.PHONY: bench_ntt # Run NTT benchmarks
+bench_ntt: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" bench --bench ntt -p tfhe-ntt \
+		--features=nightly
+#============================End NTT Section ==================================
+
 .PHONY: help # Generate list of targets with descriptions
 help:
 	@grep '^\.PHONY: .* #' Makefile | sed 's/\.PHONY: \(.*\) # \(.*\)/\1\t\2/' | expand -t30 | sort
--- a/apps/trivium/Cargo.toml
+++ b/apps/trivium/Cargo.toml
@@ -9,11 +9,11 @@ edition = "2021"
 rayon = { version = "1.7.0"}

 [target.'cfg(target_arch = "x86_64")'.dependencies.tfhe]
-path = "../../tfhe"
+path = "../../crates/tfhe"
 features = [ "boolean", "shortint", "integer", "x86_64" ]

 [target.'cfg(target_arch = "aarch64")'.dependencies.tfhe]
-path = "../../tfhe"
+path = "../../crates/tfhe"
 features = [ "boolean", "shortint", "integer", "aarch64-unix" ]

 [dev-dependencies]
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,12 +27,23 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
    std::abort();                                                              \
  }

+cudaEvent_t cuda_create_event(uint32_t gpu_index);
+
+void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
+                       uint32_t gpu_index);
+void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
+                            uint32_t gpu_index);
+
+void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);
+
 cudaStream_t cuda_create_stream(uint32_t gpu_index);

 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);

 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);

+uint32_t cuda_is_available();
+
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

 void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -102,13 +102,12 @@ template <typename Torus> struct int_decompression {
      };

      generate_device_accumulator<Torus>(
-          streams[0], gpu_indexes[0],
-          carry_extract_lut->get_lut(gpu_indexes[0], 0),
+          streams[0], gpu_indexes[0], carry_extract_lut->get_lut(0, 0),
          encryption_params.glwe_dimension, encryption_params.polynomial_size,
          encryption_params.message_modulus, encryption_params.carry_modulus,
          carry_extract_f);

-      carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+      carry_extract_lut->broadcast_lut(streams, gpu_indexes, 0);
    }
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -35,6 +35,8 @@ enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

 enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };

+enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
+
 extern "C" {
 void scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -103,17 +105,19 @@ void cleanup_cuda_full_propagation(void *const *streams,

 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);

 void cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
+    void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
+    void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
+    void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks);

 void cleanup_cuda_integer_mult(void *const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -280,23 +284,61 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory);
+
+void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory);

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry);

-void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks);
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);

 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                         uint32_t const *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void);

+void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
+                                                 uint32_t const *gpu_indexes,
+                                                 uint32_t gpu_count,
+                                                 int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
+    bool allocate_gpu_memory);
+
+void cuda_integer_overflowing_sub_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lhs_array, const void *rhs_array, void *overflow_block,
+    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
+    uint32_t uses_input_borrow);
+
+void cleanup_cuda_integer_overflowing_sub(void *const *streams,
+                                          uint32_t const *gpu_indexes,
+                                          uint32_t gpu_count,
+                                          int8_t **mem_ptr_void);
+
 void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -316,25 +358,6 @@ void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_integer_radix_overflowing_sub_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
-    void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks_in_radix);
-
-void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void);
-
 void scratch_cuda_integer_scalar_mul_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -357,42 +380,23 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,

 void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);

 void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *quotient, void *remainder, void const *numerator, void const *divisor,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_blocks_in_radix);

 void cleanup_cuda_integer_div_rem(void *const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void);

-void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks_in_radix);
-
-void cleanup_signed_overflowing_add_or_sub(void *const *streams,
-                                           uint32_t const *gpu_indexes,
-                                           uint32_t gpu_count,
-                                           int8_t **mem_ptr_void);
-
 void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -27,6 +27,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void const *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
+
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -28,7 +28,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride);
+    uint32_t num_many_lut, uint32_t lut_stride);
 #endif

 template <typename Torus>
@@ -46,7 +46,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride);
+    uint32_t num_many_lut, uint32_t lut_stride);

 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
@@ -63,7 +63,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride);
+    uint32_t num_many_lut, uint32_t lut_stride);

 template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -255,7 +255,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

 template <typename Torus>
@@ -266,7 +266,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

 #if (CUDA_ARCH >= 900)
@@ -278,7 +278,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -69,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
+    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -78,7 +78,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
+    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);

 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -27,7 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -2,6 +2,30 @@
 #include <cstdint>
 #include <cuda_runtime.h>

+cudaEvent_t cuda_create_event(uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  cudaEvent_t event;
+  check_cuda_error(cudaEventCreate(&event));
+  return event;
+}
+
+void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
+                       uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaEventRecord(event, stream));
+}
+
+void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
+                            uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
+}
+
+void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaEventDestroy(event));
+}
+
 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
@@ -21,6 +45,9 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
  check_cuda_error(cudaStreamSynchronize(stream));
 }

+// Determine if a CUDA device is available at runtime
+uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
+
 /// Unsafe function that will try to allocate even if gpu_index is invalid
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -58,9 +58,11 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
                       radix_params.big_lwe_dimension, num_blocks);

-  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
-                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
-                                     ksks, num_blocks);
+  uint32_t requested_flag = outputFlag::FLAG_NONE;
+  uint32_t uses_carry = 0;
+  host_propagate_single_carry<Torus>(
+      streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
+      bsks, ksks, num_blocks, requested_flag, uses_carry);

  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
                              mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
@@ -1,50 +0,0 @@
-#include "integer/addition.cuh"
-
-void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
-
-  SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
-                                                : SIGNED_OPERATION::SUBTRACTION;
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_signed_overflowing_add_or_sub_memory<uint64_t> **)mem_ptr,
-      num_blocks, op, params, allocate_gpu_memory);
-}
-
-void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks) {
-
-  auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
-  SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
-                                                : SIGNED_OPERATION::SUBTRACTION;
-
-  host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lhs), static_cast<uint64_t const *>(rhs),
-      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t *const *)(ksks),
-      mem, num_blocks);
-}
-
-void cleanup_signed_overflowing_add_or_sub(void *const *streams,
-                                           uint32_t const *gpu_indexes,
-                                           uint32_t gpu_count,
-                                           int8_t **mem_ptr_void) {
-  int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
-      (int_signed_overflowing_add_or_sub_memory<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -1,149 +0,0 @@
-#ifndef TFHE_RS_ADDITION_CUH
-#define TFHE_RS_ADDITION_CUH
-
-#include "crypto/keyswitch.cuh"
-#include "device.h"
-#include "integer/comparison.cuh"
-#include "integer/integer.cuh"
-#include "integer/integer_utilities.h"
-#include "integer/negation.cuh"
-#include "integer/scalar_shifts.cuh"
-#include "linear_algebra.h"
-#include "pbs/programmable_bootstrap.h"
-#include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-template <typename Torus>
-void host_resolve_signed_overflow(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation,
-    Torus const *last_block_input_carry, Torus *last_block_output_carry,
-    int_resolve_signed_overflow_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
-
-  auto x = mem->x;
-
-  Torus *d_clears =
-      (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
-
-  cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], d_clears, 2, 1);
-
-  // replace with host function call
-  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-      streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
-      mem->params.big_lwe_dimension, 1);
-
-  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                       last_block_inner_propagation, x,
-                       mem->params.big_lwe_dimension, 1);
-  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                       last_block_inner_propagation, last_block_input_carry,
-                       mem->params.big_lwe_dimension, 1);
-
-  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
-                                      last_block_inner_propagation,
-                                      mem->resolve_overflow_lut, ksks, bsks, 1);
-
-  cuda_drop_async(d_clears, streams[0], gpu_indexes[0]);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count,
-    int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
-    uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_signed_overflowing_add_or_sub_memory<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_blocks, op,
-      allocate_gpu_memory);
-}
-
-/*
- * Addition - signed_operation = 1
- * Subtraction - signed_operation = -1
- */
-template <typename Torus>
-__host__ void host_integer_signed_overflowing_add_or_sub_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed,
-    SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks,
-    int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
-    uint32_t num_blocks) {
-
-  auto radix_params = mem_ptr->params;
-
-  uint32_t big_lwe_dimension = radix_params.big_lwe_dimension;
-  uint32_t big_lwe_size = big_lwe_dimension + 1;
-  uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4);
-
-  auto result = mem_ptr->result;
-  auto neg_rhs = mem_ptr->neg_rhs;
-  auto input_carries = mem_ptr->input_carries;
-  auto output_carry = mem_ptr->output_carry;
-  auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation;
-
-  cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes,
-                               streams[0], gpu_indexes[0]);
-
-  // phase 1
-  if (op == SIGNED_OPERATION::ADDITION) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
-                         big_lwe_dimension, num_blocks);
-  } else {
-    host_integer_radix_negation<Torus>(
-        streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
-        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
-    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
-                         big_lwe_dimension, num_blocks);
-  }
-
-  // phase 2
-  for (uint j = 0; j < gpu_count; j++) {
-    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-  }
-
-  host_propagate_single_carry<Torus>(
-      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
-      input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
-  host_generate_last_block_inner_propagation<Torus>(
-      mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
-      last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
-      &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
-      ksks);
-
-  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
-  }
-
-  // phase 3
-  auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
-  if (op == SIGNED_OPERATION::SUBTRACTION && num_blocks == 1) {
-    // Quick fix for the case where the subtraction is done on a single block
-    Torus *one_scalar =
-        (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
-    cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], one_scalar, 1, 1);
-    create_trivial_radix<Torus>(
-        streams[0], gpu_indexes[0], input_carry, one_scalar, big_lwe_dimension,
-        1, 1, radix_params.message_modulus, radix_params.carry_modulus);
-    cuda_drop_async(one_scalar, streams[0], gpu_indexes[0]);
-  }
-
-  host_resolve_signed_overflow<Torus>(
-      streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
-      input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
-
-  cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes,
-                               streams[0], gpu_indexes[0]);
-}
-
-#endif // TFHE_RS_ADDITION_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -125,11 +125,11 @@ __host__ void are_all_comparisons_block_true(
          return x == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
-            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+            streams[0], gpu_indexes[0], new_lut->get_lut(0, 0), glwe_dimension,
+            polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f);

-        new_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+        new_lut->broadcast_lut(streams, gpu_indexes, 0);

        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
        lut = new_lut;
@@ -449,9 +449,9 @@ __host__ void tree_sign_reduction(
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(gpu_indexes[0], 0),
-      glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
-  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
+      polynomial_size, message_modulus, carry_modulus, f);
+  last_lut->broadcast_lut(streams, gpu_indexes, 0);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -295,7 +295,7 @@ __host__ void host_integer_decompress(
  extracted_lwe = h_mem_ptr->tmp_extracted_lwe;

  // In the case of extracting a single LWE these parameters are dummy
-  uint32_t lut_count = 1;
+  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;
  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
  /// dimension to a big LWE dimension
@@ -311,7 +311,7 @@ __host__ void host_integer_decompress(
        compression_params.small_lwe_dimension,
        encryption_params.polynomial_size, encryption_params.pbs_base_log,
        encryption_params.pbs_level, encryption_params.grouping_factor,
-        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
+        num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
  } else {
    /// For multi GPU execution we create vectors of pointers for inputs and
    /// outputs
@@ -338,7 +338,7 @@ __host__ void host_integer_decompress(
        compression_params.small_lwe_dimension,
        encryption_params.polynomial_size, encryption_params.pbs_base_log,
        encryption_params.pbs_level, encryption_params.grouping_factor,
-        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
+        num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -2,11 +2,12 @@

 void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -14,7 +15,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_div_rem_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
      (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }
@@ -22,7 +23,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
 void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *quotient, void *remainder, void const *numerator, void const *divisor,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_blocks) {

  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
@@ -31,8 +32,8 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
      static_cast<const uint64_t *>(numerator),
-      static_cast<const uint64_t *>(divisor), bsks, (uint64_t **)(ksks), mem,
-      num_blocks);
+      static_cast<const uint64_t *>(divisor), is_signed, bsks,
+      (uint64_t **)(ksks), mem, num_blocks);
 }

 void cleanup_cuda_integer_div_rem(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -3,6 +3,7 @@

 #include "crypto/keyswitch.cuh"
 #include "device.h"
+#include "integer/abs.cuh"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
@@ -161,22 +162,21 @@ template <typename Torus> struct lwe_ciphertext_list {
 template <typename Torus>
 __host__ void scratch_cuda_integer_div_rem_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_div_rem_memory<Torus> **mem_ptr,
+    uint32_t gpu_count, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

-  *mem_ptr = new int_div_rem_memory<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
+  *mem_ptr =
+      new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                    is_signed, num_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
-__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, Torus *quotient,
-                                      Torus *remainder, Torus const *numerator,
-                                      Torus const *divisor, void *const *bsks,
-                                      uint64_t *const *ksks,
-                                      int_div_rem_memory<uint64_t> *mem_ptr,
-                                      uint32_t num_blocks) {
+__host__ void host_unsigned_integer_div_rem_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *quotient, Torus *remainder,
+    Torus const *numerator, Torus const *divisor, void *const *bsks,
+    uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr,
+    uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;

@@ -425,11 +425,24 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    auto do_overflowing_sub = [&](cudaStream_t const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count) {
-      host_integer_overflowing_sub_kb<Torus>(
-          streams, gpu_indexes, gpu_count, new_remainder.data,
-          subtraction_overflowed.data, merged_interesting_remainder.data,
-          interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
+      uint32_t compute_borrow = 1;
+      uint32_t uses_input_borrow = 0;
+      auto first_indexes = mem_ptr->first_indexes_for_overflow_sub
+                               [merged_interesting_remainder.len - 1];
+      auto second_indexes = mem_ptr->second_indexes_for_overflow_sub
+                                [merged_interesting_remainder.len - 1];
+      auto scalar_indexes =
+          mem_ptr
+              ->scalars_for_overflow_sub[merged_interesting_remainder.len - 1];
+      mem_ptr->overflow_sub_mem->update_lut_indexes(
+          streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
          merged_interesting_remainder.len);
+      host_integer_overflowing_sub<uint64_t>(
+          streams, gpu_indexes, gpu_count, new_remainder.data,
+          (uint64_t *)merged_interesting_remainder.data,
+          interesting_divisor.data, subtraction_overflowed.data,
+          (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
+          merged_interesting_remainder.len, compute_borrow, uses_input_borrow);
    };

    // fills:
@@ -594,4 +607,108 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
  }
 }

+template <typename Torus>
+__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count, Torus *quotient,
+                                      Torus *remainder, Torus const *numerator,
+                                      Torus const *divisor, bool is_signed,
+                                      void *const *bsks, uint64_t *const *ksks,
+                                      int_div_rem_memory<uint64_t> *int_mem_ptr,
+                                      uint32_t num_blocks) {
+
+  if (is_signed) {
+    auto radix_params = int_mem_ptr->params;
+    uint32_t big_lwe_size = radix_params.big_lwe_dimension + 1;
+
+    // temporary memory
+    lwe_ciphertext_list<Torus> positive_numerator(
+        int_mem_ptr->positive_numerator, radix_params, num_blocks);
+    lwe_ciphertext_list<Torus> positive_divisor(int_mem_ptr->positive_divisor,
+                                                radix_params, num_blocks);
+
+    positive_numerator.clone_from((Torus *)numerator, 0, num_blocks - 1,
+                                  streams[0], gpu_indexes[0]);
+    positive_divisor.clone_from((Torus *)divisor, 0, num_blocks - 1, streams[0],
+                                gpu_indexes[0]);
+
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
+                               gpu_count, positive_numerator.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_1, true, num_blocks);
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                               gpu_count, positive_divisor.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_2, true, num_blocks);
+    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+    }
+
+    host_unsigned_integer_div_rem_kb<Torus>(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, remainder,
+        positive_numerator.data, positive_divisor.data, bsks, ksks,
+        int_mem_ptr->unsigned_mem, num_blocks);
+
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
+        int_mem_ptr->sign_bits_are_different,
+        &numerator[big_lwe_size * (num_blocks - 1)],
+        &divisor[big_lwe_size * (num_blocks - 1)], bsks, ksks, 1,
+        int_mem_ptr->compare_signed_bits_lut,
+        int_mem_ptr->compare_signed_bits_lut->params.message_modulus);
+
+    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+    }
+
+    host_integer_radix_negation(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
+        int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
+        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
+
+    uint32_t requested_flag = outputFlag::FLAG_NONE;
+    uint32_t uses_carry = 0;
+    host_propagate_single_carry<Torus>(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
+        int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
+        bsks, ksks, num_blocks, requested_flag, uses_carry);
+
+    host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes,
+                                gpu_count, int_mem_ptr->negated_remainder,
+                                remainder, radix_params.big_lwe_dimension,
+                                num_blocks, radix_params.message_modulus,
+                                radix_params.carry_modulus);
+
+    host_propagate_single_carry<Torus>(
+        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
+        int_mem_ptr->negated_remainder, nullptr, nullptr,
+        int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks, requested_flag,
+        uses_carry);
+
+    host_integer_radix_cmux_kb<Torus>(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
+        int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
+        quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks);
+
+    host_integer_radix_cmux_kb<Torus>(
+        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
+        &numerator[big_lwe_size * (num_blocks - 1)],
+        int_mem_ptr->negated_remainder, remainder,
+        int_mem_ptr->cmux_remainder_mem, bsks, ksks, num_blocks);
+
+    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+    }
+  } else {
+    host_unsigned_integer_div_rem_kb<Torus>(
+        streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
+        divisor, bsks, ksks, int_mem_ptr->unsigned_mem, num_blocks);
+  }
+}
+
 #endif // TFHE_RS_DIV_REM_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -1,4 +1,5 @@
 #include "integer/integer.cuh"
+#include "integer/negation.cuh"
 #include <linear_algebra.h>

 void cuda_full_propagation_64_inplace(void *const *streams,
@@ -49,7 +50,8 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -59,30 +61,94 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
  scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
-      allocate_gpu_memory);
+      requested_flag, uses_carry, allocate_gpu_memory);
+}
+
+void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
+      requested_flag, uses_carry, allocate_gpu_memory);
+}
+
+void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_overflowing_sub<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
+      compute_overflow, allocate_gpu_memory);
 }

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks) {
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry) {
+
  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
-      nullptr, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), num_blocks);
+      static_cast<const uint64_t *>(carry_in),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks, requested_flag, uses_carry);
 }

-void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks) {
-  host_propagate_single_carry<uint64_t>(
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry) {
+
+  host_add_and_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
-      static_cast<uint64_t *>(input_carries),
+      static_cast<uint64_t *>(lhs_array),
+      static_cast<const uint64_t *>(rhs_array),
+      static_cast<uint64_t *>(carry_out),
+      static_cast<const uint64_t *>(carry_in),
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      num_blocks);
+      num_blocks, requested_flag, uses_carry);
+}
+
+void cuda_integer_overflowing_sub_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lhs_array, const void *rhs_array, void *overflow_block,
+    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
+    uint32_t uses_input_borrow) {
+
+  host_integer_overflowing_sub<uint64_t>(
+      (cudaStream_t const *)streams, gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
+      static_cast<const uint64_t *>(rhs_array),
+      static_cast<uint64_t *>(overflow_block),
+      static_cast<const uint64_t *>(input_borrow),
+      (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
+      num_blocks, compute_overflow, uses_input_borrow);
 }

 void cleanup_cuda_propagate_single_carry(void *const *streams,
@@ -94,6 +160,23 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

+void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
+                                                 uint32_t const *gpu_indexes,
+                                                 uint32_t gpu_count,
+                                                 int8_t **mem_ptr_void) {
+  int_sc_prop_memory<uint64_t> *mem_ptr =
+      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+void cleanup_cuda_integer_overflowing_sub(void *const *streams,
+                                          uint32_t const *gpu_indexes,
+                                          uint32_t gpu_count,
+                                          int8_t **mem_ptr_void) {
+  int_borrow_prop_memory<uint64_t> *mem_ptr =
+      (int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+
 void scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -142,14 +225,14 @@ void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks, void *const *bsks, uint32_t num_blocks,
-    uint32_t lut_count, uint32_t lut_stride) {
+    uint32_t num_many_lut, uint32_t lut_stride) {

  host_apply_many_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
      static_cast<const uint64_t *>(input_radix_lwe),
      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
-      lut_count, lut_stride);
+      num_many_lut, lut_stride);
 }

 void scratch_cuda_apply_bivariate_lut_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -67,11 +67,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
 */
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size * glwe_dimension, lwe_dimension,
@@ -88,8 +89,8 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
  case 16384:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
        (cudaStream_t const *)(streams), gpu_indexes, gpu_count,
-        (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
-        allocate_gpu_memory);
+        (int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
+        num_radix_blocks, params, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -126,65 +127,66 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 */
 void cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
+    void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
+    void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
+    void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks) {

  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 512:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 1024:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 2048:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 4096:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 8192:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 16384:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -9,6 +9,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "helper_multi_gpu.h"
+#include "integer/cmux.cuh"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
 #include "linear_algebra.h"
@@ -208,7 +209,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  auto small_lwe_size = small_lwe_dimension + 1;

  // In the case of extracting a single LWE this parameters are dummy
-  uint32_t lut_count = 1;
+  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;

  if (num_radix_in_vec == 0)
@@ -266,8 +267,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
        2 * ch_amount * num_blocks, reused_lut);
  }
-  auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
-  auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
+  auto message_acc = luts_message_carry->get_lut(0, 0);
+  auto carry_acc = luts_message_carry->get_lut(0, 1);

  // define functions for each accumulator
  auto lut_f_message = [message_modulus](Torus x) -> Torus {
@@ -284,7 +285,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  generate_device_accumulator<Torus>(
      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
      message_modulus, carry_modulus, lut_f_carry);
-  luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

  while (r > 2) {
    size_t cur_total_blocks = r * num_blocks;
@@ -333,10 +334,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    if (carry_count > 0)
      cuda_set_value_async<Torus>(
          streams[0], gpu_indexes[0],
-          luts_message_carry->get_lut_indexes(gpu_indexes[0], message_count), 1,
+          luts_message_carry->get_lut_indexes(0, message_count), 1,
          carry_count);

-    luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

    /// For multi GPU execution we create vectors of pointers for inputs and
    /// outputs
@@ -369,7 +370,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type, lut_count, lut_stride);
+          mem_ptr->params.pbs_type, num_many_lut, lut_stride);
    } else {
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);

@@ -417,7 +418,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type, lut_count, lut_stride);
+          mem_ptr->params.pbs_type, num_many_lut, lut_stride);

      multi_gpu_gather_lwe_async<Torus>(
          streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
@@ -453,7 +454,8 @@ template <typename Torus, class params>
 __host__ void host_integer_mult_radix_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
-    uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
+    bool const is_bool_left, uint64_t const *radix_lwe_right,
+    bool const is_bool_right, void *const *bsks, uint64_t *const *ksks,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  auto glwe_dimension = mem_ptr->params.glwe_dimension;
@@ -464,6 +466,20 @@ __host__ void host_integer_mult_radix_kb(

  int big_lwe_dimension = glwe_dimension * polynomial_size;

+  if (is_bool_right) {
+    zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
+                       radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
+                       mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
+    return;
+  }
+
+  if (is_bool_left) {
+    zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
+                       radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
+                       mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
+    return;
+  }
+
  // 'vector_result_lsb' contains blocks from all possible right shifts of
  // radix_lwe_left, only nonzero blocks are kept
  int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;
@@ -562,19 +578,26 @@ __host__ void host_integer_mult_radix_kb(
      terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
      2 * num_blocks, mem_ptr->luts_array);

-  auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
-  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
-                                     radix_lwe_out, nullptr, nullptr,
-                                     scp_mem_ptr, bsks, ksks, num_blocks);
+  uint32_t block_modulus = message_modulus * carry_modulus;
+  uint32_t num_bits_in_block = log2_int(block_modulus);
+
+  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
+  uint32_t requested_flag = outputFlag::FLAG_NONE;
+  uint32_t uses_carry = 0;
+  host_propagate_single_carry<Torus>(
+      streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
+      scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
+    bool const is_boolean_left, bool const is_boolean_right,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
  *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                       is_boolean_left, is_boolean_right,
                                       num_radix_blocks, allocate_gpu_memory);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -12,49 +12,3 @@ void cuda_negate_integer_radix_ciphertext_64(
      static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
      lwe_ciphertext_count, message_modulus, carry_modulus);
 }
-
-void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks, params,
-      allocate_gpu_memory);
-}
-
-void cuda_integer_radix_overflowing_sub_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
-    void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks) {
-
-  auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
-
-  host_integer_overflowing_sub_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(radix_lwe_out),
-      static_cast<uint64_t *>(radix_lwe_overflowed),
-      static_cast<const uint64_t *>(radix_lwe_left),
-      static_cast<const uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
-      mem, num_blocks);
-}
-
-void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void) {
-  int_overflowing_sub_memory<uint64_t> *mem_ptr =
-      (int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
  *mem_ptr = new int_overflowing_sub_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }
-
+/*
 template <typename Torus>
 __host__ void host_integer_overflowing_sub_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -113,4 +113,39 @@ __host__ void host_integer_overflowing_sub_kb(
                                          mem_ptr, bsks, ksks, num_blocks);
 }

+*/
+template <typename Torus>
+__host__ void host_integer_overflowing_sub(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
+    const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
+    int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
+    Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
+    uint32_t uses_input_borrow) {
+
+  auto radix_params = mem_ptr->params;
+
+  // We need to recalculate the num_groups, because on the division the number
+  // of num_blocks changes
+  uint32_t block_modulus =
+      radix_params.message_modulus * radix_params.carry_modulus;
+  uint32_t num_bits_in_block = log2_int(block_modulus);
+  uint32_t grouping_size = num_bits_in_block;
+  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
+
+  auto stream = (cudaStream_t *)streams;
+  host_unchecked_sub_with_correcting_term<Torus>(
+      stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
+      static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
+      radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
+      radix_params.carry_modulus, radix_params.message_modulus - 1);
+
+  host_single_borrow_propagate<Torus>(
+      streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
+      static_cast<Torus *>(overflow_block),
+      static_cast<const Torus *>(input_borrow),
+      (int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
+      num_blocks, num_groups, compute_overflow, uses_input_borrow);
+}
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -31,10 +31,10 @@ __host__ void host_integer_radix_scalar_bitop_kb(
  } else {
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
-    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(gpu_indexes[0], 0),
-                                 clear_blocks, num_clear_blocks * sizeof(Torus),
-                                 streams[0], gpu_indexes[0]);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
+                                 num_clear_blocks * sizeof(Torus), streams[0],
+                                 gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -110,11 +110,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        scalar_last_leaf_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
+                                       lut->get_lut(0, 0), glwe_dimension,
+                                       polynomial_size, message_modulus,
+                                       carry_modulus, scalar_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -194,10 +194,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -292,7 +292,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    Torus const *sign_block =
        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;

-    auto sign_bit_pos = (int)std::log2(message_modulus) - 1;
+    auto sign_bit_pos = (int)log2_int(message_modulus) - 1;

    auto scalar_last_leaf_with_respect_to_zero_lut_f =
        [sign_handler_f, sign_bit_pos,
@@ -329,10 +329,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -422,11 +422,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto signed_msb_lut = mem_ptr->signed_msb_lut;
    generate_device_accumulator_bivariate<Torus>(
-        msb_streams[0], gpu_indexes[0],
-        signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        lut_f);
-    signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+        msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, lut_f);
+    signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);

    Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -676,10 +675,10 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
                       scalar_blocks, 0, num_scalar_blocks, message_modulus);

-    cuda_memcpy_async_gpu_to_gpu(
-        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
-        packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
-        gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(scalar_comparison_luts->get_lut_indexes(0, 0),
+                                 packed_scalar,
+                                 num_halved_scalar_blocks * sizeof(Torus),
+                                 lsb_streams[0], gpu_indexes[0]);
    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -54,7 +54,7 @@ __host__ void host_integer_scalar_mul_radix(
  // whereas lwe_dimension is the number of elements in the mask
  uint32_t lwe_size = input_lwe_dimension + 1;
  uint32_t lwe_size_bytes = lwe_size * sizeof(T);
-  uint32_t msg_bits = (uint32_t)std::log2(message_modulus);
+  uint32_t msg_bits = log2_int(message_modulus);
  uint32_t num_ciphertext_bits = msg_bits * num_radix_blocks;

  T *preshifted_buffer = mem->preshifted_buffer;
@@ -112,10 +112,12 @@ __host__ void host_integer_scalar_mul_radix(
        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
        num_radix_blocks, j, nullptr);

-    auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
-    host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
-                                   nullptr, nullptr, scp_mem_ptr, bsks, ksks,
-                                   num_radix_blocks);
+    auto scp_mem_ptr = mem->sc_prop_mem;
+    uint32_t requested_flag = outputFlag::FLAG_NONE;
+    uint32_t uses_carry = 0;
+    host_propagate_single_carry<T>(
+        streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
+        scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -38,7 +38,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

-  size_t num_bits_in_message = (size_t)log2(message_modulus);
+  size_t num_bits_in_message = (size_t)log2_int(message_modulus);
  size_t total_num_bits = num_bits_in_message * num_blocks;
  n = n % total_num_bits;

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -38,7 +38,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

-  size_t num_bits_in_block = (size_t)log2(message_modulus);
+  size_t num_bits_in_block = (size_t)log2_int(message_modulus);
  size_t total_num_bits = num_bits_in_block * num_blocks;
  shift = shift % total_num_bits;

@@ -141,7 +141,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

-  size_t num_bits_in_block = (size_t)log2(message_modulus);
+  size_t num_bits_in_block = (size_t)log2_int(message_modulus);
  size_t total_num_bits = num_bits_in_block * num_blocks;
  shift = shift % total_num_bits;

--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -29,7 +29,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    uint32_t gpu_count, Torus *lwe_array, Torus const *lwe_shift,
    int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {
-  uint32_t bits_per_block = std::log2(mem->params.message_modulus);
+  uint32_t bits_per_block = log2_int(mem->params.message_modulus);
  uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
  if (total_nb_bits == 0)
    return;
@@ -55,7 +55,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  // then the behaviour of shifting won't be the same
  // if shift >= total_nb_bits compared to when total_nb_bits
  // is a power of two, as will 'capture' more bits in `shift_bits`
-  uint32_t max_num_bits_that_tell_shift = std::log2(total_nb_bits);
+  uint32_t max_num_bits_that_tell_shift = log2_int(total_nb_bits);
  if (!is_power_of_two(total_nb_bits))
    max_num_bits_that_tell_shift += 1;
  // Extracts bits and put them in the bit index 2 (=> bit number 3)
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -57,6 +57,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                          static_cast<const uint64_t *>(lwe_array_in_2),
                          input_lwe_dimension, input_lwe_ciphertext_count);
 }
+
 /*
 * Perform the addition of a u32 input LWE ciphertext vector with a u32
 * plaintext vector. See the equivalent operation on u64 data for more details.
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -82,6 +82,46 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
  check_cuda_error(cudaGetLastError());
 }

+template <typename T>
+__global__ void pack_for_overflowing_ops(T *output, T const *input_1,
+                                         T const *input_2, uint32_t num_entries,
+                                         uint32_t message_modulus) {
+
+  int tid = threadIdx.x;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = input_1[index] * message_modulus + input_2[index];
+  }
+}
+
+template <typename T>
+__host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
+                                            uint32_t gpu_index, T *output,
+                                            T const *input_1, T const *input_2,
+                                            uint32_t input_lwe_dimension,
+                                            uint32_t input_lwe_ciphertext_count,
+                                            uint32_t message_modulus) {
+
+  cudaSetDevice(gpu_index);
+  // lwe_size includes the presence of the body
+  // whereas lwe_dimension is the number of elements in the mask
+  int lwe_size = input_lwe_dimension + 1;
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = lwe_size;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  pack_for_overflowing_ops<T><<<grid, thds, 0, stream>>>(
+      &output[(input_lwe_ciphertext_count - 1) * lwe_size],
+      &input_1[(input_lwe_ciphertext_count - 1) * lwe_size],
+      &input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size,
+      message_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename T>
 __global__ void subtraction(T *output, T const *input_1, T const *input_2,
                            uint32_t num_entries) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -92,7 +92,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                       uint32_t polynomial_size, uint32_t base_log,
                       uint32_t level_count, uint32_t grouping_factor,
                       uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
-                       uint32_t lut_count, uint32_t lut_stride) {
+                       uint32_t num_many_lut, uint32_t lut_stride) {

  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
@@ -126,7 +126,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            lut_count, lut_stride);
+            num_many_lut, lut_stride);
      }
      break;
    default:
@@ -165,7 +165,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
            polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu, lut_count, lut_stride);
+            num_inputs_on_gpu, num_many_lut, lut_stride);
      }
      break;
    case CLASSICAL:
@@ -194,7 +194,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            lut_count, lut_stride);
+            num_many_lut, lut_stride);
      }
      break;
    default:
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -45,7 +45,7 @@ __global__ void device_programmable_bootstrap_cg(
    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t lut_count,
+    uint64_t device_memory_size_per_block, uint32_t num_many_lut,
    uint32_t lut_stride) {

  grid_group grid = this_grid();
@@ -152,8 +152,8 @@ __global__ void device_programmable_bootstrap_cg(
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
+      if (num_many_lut > 1) {
+        for (int i = 1; i < num_many_lut; i++) {
          auto next_lwe_array_out =
              lwe_array_out +
              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
@@ -168,8 +168,8 @@ __global__ void device_programmable_bootstrap_cg(
      }
    } else if (blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
+      if (num_many_lut > 1) {
+        for (int i = 1; i < num_many_lut; i++) {

          auto next_lwe_array_out =
              lwe_array_out +
@@ -235,7 +235,7 @@ __host__ void host_programmable_bootstrap_cg(
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t lut_count, uint32_t lut_stride) {
+    uint32_t num_many_lut, uint32_t lut_stride) {

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
@@ -273,7 +273,7 @@ __host__ void host_programmable_bootstrap_cg(
  kernel_args[10] = &base_log;
  kernel_args[11] = &level_count;
  kernel_args[12] = &d_mem;
-  kernel_args[14] = &lut_count;
+  kernel_args[14] = &num_many_lut;
  kernel_args[15] = &lut_stride;

  if (max_shared_memory < partial_sm) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -32,7 +32,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
        int8_t *device_mem, uint64_t device_memory_size_per_block,
-        uint32_t lut_count, uint32_t lut_stride) {
+        uint32_t num_many_lut, uint32_t lut_stride) {
+
  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
@@ -134,8 +135,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        // default
        sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);

-        if (lut_count > 1) {
-          for (int i = 1; i < lut_count; i++) {
+        if (num_many_lut > 1) {
+          for (int i = 1; i < num_many_lut; i++) {
            auto next_lwe_array_out =
                lwe_array_out +
                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
@@ -153,8 +154,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)

        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

-        if (lut_count > 1) {
-          for (int i = 1; i < lut_count; i++) {
+        if (num_many_lut > 1) {
+          for (int i = 1; i < num_many_lut; i++) {

            auto next_lwe_array_out =
                lwe_array_out +
@@ -293,7 +294,7 @@ __host__ void execute_cg_external_product_loop(
    Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
+    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {

  uint64_t full_sm =
@@ -343,7 +344,7 @@ __host__ void execute_cg_external_product_loop(
  kernel_args[16] = &chunk_size;
  kernel_args[17] = &keybundle_size_per_input;
  kernel_args[18] = &d_mem;
-  kernel_args[20] = &lut_count;
+  kernel_args[20] = &num_many_lut;
  kernel_args[21] = &lut_stride;

  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
@@ -379,7 +380,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lut_count, uint32_t lut_stride) {
+    uint32_t num_many_lut, uint32_t lut_stride) {

  auto lwe_chunk_size = buffer->lwe_chunk_size;

@@ -397,7 +398,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, lut_count,
+        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
        lut_stride);
  }
 }
--- a/Show More
+++ b/Show More