fix precision

fix some bugs
feat(gpu): Implement fft128 in cuda backend
2026-04-28 03:01:21 -04:00 · 2025-02-05 20:00:20 +04:00 · 2025-02-03 16:02:57 +04:00 · 2025-01-20 15:43:19 +04:00 · 2024-11-25 11:06:28 +01:00 · 2024-11-25 11:06:19 +01:00
420 changed files with 91896 additions and 11010 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,2 +1,6 @@
 [alias]
 xtask = "run --manifest-path ./tasks/Cargo.toml --"
+
+# Accessed by wasm-bindgen when testing for the wasm target
+[target.wasm32-unknown-unknown]
+runner = 'wasm-bindgen-test-runner'
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -100,7 +100,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -57,18 +57,19 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
+              - tfhe-fft/**
              - tfhe-zk-pok/**
              - utils/tfhe-versionable/**
              - utils/tfhe-versionable-derive/**
            csprng:
-              - concrete-csprng/**
+              - tfhe-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
            versionable:
@@ -131,7 +132,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -157,14 +158,14 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

-      - name: Run concrete-csprng tests
+      - name: Run tfhe-csprng tests
        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
-          make test_concrete_csprng
+          make test_tfhe_csprng

      - name: Run tfhe-zk-pok tests
        if: needs.should-run.outputs.zk_pok_test == 'true'
@@ -244,9 +245,13 @@ jobs:
        run: |
          make test_high_level_api

-      - name: Run safe deserialization tests
+      - name: Run safe serialization tests
        run: |
-          make test_safe_deserialization
+          make test_safe_serialization
+
+      - name: Run zk tests
+        run: |
+          make test_zk

      - name: Slack Notification
        if: ${{ failure() }}
@@ -264,7 +269,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -46,13 +46,14 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
+              - tfhe-fft/**
              - tfhe-zk-pok/**
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
@@ -72,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -96,7 +97,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -137,7 +138,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -46,13 +46,14 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
+              - tfhe-fft/**
              - tfhe-zk-pok/**
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
@@ -72,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -96,7 +97,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -141,7 +142,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -40,6 +40,9 @@ jobs:
      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.shortint_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
+      strings_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.strings_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
      high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.high_level_api_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -63,16 +66,17 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
+              - tfhe-fft/**
              - tfhe-zk-pok/**
            csprng:
-              - concrete-csprng/**
+              - tfhe-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
            core_crypto:
@@ -83,6 +87,11 @@ jobs:
            shortint:
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
+            strings:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+              - tfhe/src/strings/**
            high_level_api:
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
@@ -112,6 +121,7 @@ jobs:
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
+          steps.changed-files.outputs.strings_any_changed == 'true' ||
          steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
          steps.changed-files.outputs.c_api_any_changed == 'true' ||
          steps.changed-files.outputs.examples_any_changed == 'true' ||
@@ -131,7 +141,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -157,14 +167,14 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

-      - name: Run concrete-csprng tests
+      - name: Run tfhe-csprng tests
        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
-          make test_concrete_csprng
+          make test_tfhe_csprng

      - name: Run tfhe-zk-pok tests
        if: needs.should-run.outputs.zk_pok_test == 'true'
@@ -201,6 +211,11 @@ jobs:
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_shortint_ci

+      - name: Run strings tests
+        if: needs.should-run.outputs.strings_test == 'true'
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_strings
+
      - name: Run high-level API tests
        if: needs.should-run.outputs.high_level_api_test == 'true'
        run: |
@@ -234,7 +249,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -99,6 +99,10 @@ jobs:
        run: |
          make test_web_js_api_parallel_chrome_ci

+      - name: Run x86_64/wasm zk compatibility tests
+        run: |
+          make test_zk_wasm_x86_compat_ci
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
@@ -115,7 +119,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -61,13 +61,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -85,8 +80,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      - name: Measure key sizes
        run: |
@@ -133,7 +127,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -57,13 +57,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -83,8 +78,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --name-suffix avx512 \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -121,7 +115,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,13 +62,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -129,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -54,7 +54,7 @@ jobs:
          echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -79,8 +79,7 @@ jobs:
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -127,7 +126,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -154,7 +153,6 @@ jobs:
           --commit-date "${{ env.COMMIT_DATE }}" \
           --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,6 @@ jobs:
          make -j"$(nproc)"
          sudo make install

-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
@@ -85,7 +84,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -126,8 +125,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --name-suffix avx512 \
-          --walk-subdirs \
-          --throughput
+          --walk-subdirs

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
@@ -169,7 +167,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -1,195 +1,41 @@
-# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: ERC20 GPU H100 benchmarks
+# Run CUDA ERC20 benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
+name: Cuda ERC20 benchmarks

 on:
  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 5a.m.
-    - cron: '0 5 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+    inputs:
+      profile:
+        description: "Instance type"
+        required: true
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "multi-h100 (n3-H100x8)"

 jobs:
-  setup-instance:
-    name: Setup instance (cuda-erc20-benchmarks)
+  parse-inputs:
    runs-on: ubuntu-latest
-    if:  github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-erc20-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
+      - name: Parse profile
+        id: parse_profile
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
+      - name: Parse hardware name
+        id: parse_hardware_name
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks
-        run: |
-          make bench_hlapi_erc20_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512
-
-      - name: Parse PBS counts
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
-          --object-sizes \
-          --append-results
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_erc20
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-erc20-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-erc20-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+  run-benchmarks:
+    name: Run benchmarks
+    needs: parse-inputs
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: ${{ needs.parse-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_integer_2H100_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_2H100_full.yml
@@ -1,15 +1,40 @@
-# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer 2xH100 benchmarks
+# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Cuda ERC20 benchmarks - common

 on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
+  workflow_call:
+    inputs:
+      backend:
+        type: string
+        default: hyperstack
+      profile:
+        type: string
+        required: true
+      hardware_name:
+        type: string
+        required: true
+    secrets:
+      FHE_ACTIONS_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
@@ -20,37 +45,32 @@ env:

 jobs:
  setup-instance:
-    name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
+    name: Setup instance (cuda-erc20-benchmarks)
    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
+    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: 2-h100
+          backend: ${{ inputs.backend }}
+          profile: ${{ inputs.profile }}

-  cuda-integer-full-2-gpu-benchmarks:
-    name: Execute 2xH100 integer benchmarks
+  cuda-erc20-benchmarks:
+    name: Cuda ERC20 benchmarks (${{ inputs.profile }})
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
    strategy:
      fail-fast: false
-      max-parallel: 1
+      # explicit include-based build matrix, of known valid options
      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
@@ -63,7 +83,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -91,7 +111,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -115,41 +135,40 @@ jobs:
            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
          } >> "${GITHUB_ENV}"

-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
      - name: Check device is detected
        if: ${{ !cancelled() }}
        run: nvidia-smi

-      - name: Run benchmarks with AVX512
+      - name: Run benchmarks
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+          make bench_hlapi_erc20_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n3-H100x2" \
+          --hardware "${{ inputs.hardware_name }}" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_erc20_${{ inputs.profile }}  # profile suffix keeps the name unique when several callers share one run
          path: ${{ env.RESULTS_FILENAME }}

+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
      - name: Send data to Slab
        shell: bash
        run: |
@@ -158,26 +177,26 @@ jobs:

  slack-notify:
    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
+    needs: [ setup-instance, cuda-erc20-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
+          SLACK_MESSAGE: "Cuda ERC20 benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
+    name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
+    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -191,4 +210,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-erc20-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_erc20_weekly.yml
+++ b/.github/workflows/benchmark_gpu_erc20_weekly.yml
@@ -0,0 +1,35 @@
+# Run CUDA ERC20 benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
+name: Cuda ERC20 weekly benchmarks
+
+on:
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 5 a.m. UTC.
+    - cron: '0 5 * * 6'
+
+jobs:
+  run-benchmarks-1-h100:
+    name: Run benchmarks (1xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+    secrets: inherit
+
+  run-benchmarks-2-h100:
+    name: Run benchmarks (2xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
+    secrets: inherit
+
+  run-benchmarks-8-h100:
+    name: Run benchmarks (8xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
+    with:
+      profile: multi-h100
+      hardware_name: n3-H100x8
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -1,201 +1,78 @@
-# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer GPU benchmarks
+# Run CUDA benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
+name: Cuda benchmarks

 on:
  workflow_dispatch:
-  push:
-    branches:
-      - main
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+    inputs:
+      profile:
+        description: "Instance type"
+        required: true
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+      command:
+        description: "Benchmark command to run"
+        type: choice
+        default: integer_multi_bit
+        options:
+          - integer
+          - integer_multi_bit
+          - integer_compression
+          - pbs
+          - ks
+      op_flavor:
+        description: "Operations set to run"
+        type: choice
+        default: default
+        options:
+          - default
+          - fast_default
+          - unchecked
+      all_precisions:
+        description: "Run all precisions"
+        type: boolean
+        default: false
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both

 jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-benchmarks)
+  parse-inputs:
    runs-on: ubuntu-latest
-    if:  github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-integer-benchmarks:
-    name: Execute GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
+      - name: Parse profile
+        id: parse_profile
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
+      - name: Parse hardware name
+        id: parse_hardware_name
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+  run-benchmarks:
+    name: Run benchmarks
+    needs: parse-inputs
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    with:
+      profile: ${{ needs.parse-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
+      command: ${{ inputs.command }}
+      op_flavor: ${{ inputs.op_flavor }}
+      bench_type: ${{ inputs.bench_type }}
+      all_precisions: ${{ inputs.all_precisions }}
+    secrets: inherit
--- a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
@@ -1,21 +1,47 @@
-# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer multi GPU Multi-bit benchmarks
+# Run integer benchmarks on CUDA instance and return parsed results to Slab CI bot.
+name: Cuda benchmarks - common

 on:
-  workflow_dispatch:
+  workflow_call:
    inputs:
+      backend:
+        type: string
+        default: hyperstack
+      profile:
+        type: string
+        required: true
+      hardware_name:
+        type: string
+        required: true
+      command: # Comma-separated list of benchmark commands; prepare-matrix expands it into a JSON array for the job matrix
+        type: string
+        required: true
+      op_flavor: # Comma-separated list of operation flavors; prepare-matrix expands it into a JSON array for the job matrix
+        type: string
+        required: true
+      bench_type:
+        type: string
+        default: latency
      all_precisions:
-        description: "Run all precisions"
        type: boolean
        default: false
-      fast_default:
-        description: "Run only deduplicated default operations without scalar variants"
-        type: boolean
-        default: false
-
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
+    secrets:
+      FHE_ACTIONS_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true

 env:
  CARGO_TERM_COLOR: always
@@ -28,32 +54,82 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE
-  BENCH_OP_FLAVOR: default

 jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+  prepare-matrix:
+    name: Prepare operations matrix
+    runs-on: ubuntu-latest
+    outputs:
+      command: ${{ steps.set_command.outputs.command }}
+      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
+    steps:
+      - name: Set single command
+        if: ${{ !contains(inputs.command, ',')}}
+        run: |
+          echo "COMMAND=[\"${{ inputs.command }}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set multiple commands
+        if: ${{ contains(inputs.command, ',')}}
+        run: |
+          PARSED_COMMAND=$(echo "${{ inputs.command }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
+          echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set single operations flavor
+        if: ${{ !contains(inputs.op_flavor, ',')}}
+        run: |
+          echo "OP_FLAVOR=[\"${{ inputs.op_flavor }}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set multiple operations flavors
+        if: ${{ contains(inputs.op_flavor, ',')}}
+        run: |
+          PARSED_OP_FLAVOR=$(echo "${{ inputs.op_flavor }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
+          echo "OP_FLAVOR=[\"${PARSED_OP_FLAVOR}\"]" >> "${GITHUB_ENV}"
+
+      - name: Set benchmark types
+        run: |
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Set command output
+        id: set_command
+        run: |
+          echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
+
+      - name: Set operation flavor output
+        id: set_op_flavor
+        run: |
+          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
+
+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
+  setup-instance:
+    name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
+    needs: prepare-matrix
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch' }}
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: multi-h100
+          backend: ${{ inputs.backend }}
+          profile: ${{ inputs.profile }}

-  cuda-integer-multi-bit-multi-gpu-benchmarks:
-    name: Execute multi GPU integer multi-bit benchmarks
-    needs: setup-instance
+  cuda-benchmarks:
+    name: Cuda benchmarks (${{ inputs.profile }})
+    needs: [ prepare-matrix, setup-instance ]
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
    continue-on-error: true
@@ -61,6 +137,10 @@ jobs:
      fail-fast: false
      max-parallel: 1
      matrix:
+        command: ${{ fromJSON(needs.prepare-matrix.outputs.command) }}
+        op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
+        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
@@ -101,7 +181,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -132,42 +212,36 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
        run: |
          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"

-      - name: Should run fast subset benchmarks
-        if: inputs.fast_default
+      - name: Run benchmarks
        run: |
-          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make bench_unsigned_integer_multi_bit_gpu
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n3-H100x8" \
+          --hardware "${{ inputs.hardware_name }}" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_integer
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}  # one unique name per matrix cell and caller
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -178,26 +252,26 @@ jobs:

  slack-notify:
    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    needs: [ setup-instance, cuda-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
+    if: ${{ always() && needs.cuda-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-benchmarks.result }}
+          SLACK_MESSAGE: "Cuda benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: Teardown instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+    name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    needs: [ setup-instance, cuda-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -211,4 +285,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_full.yml
@@ -1,200 +0,0 @@
-# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer GPU full benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-integer-full-benchmarks:
-    name: Execute GPU integer benchmarks for all operations flavor
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      # Run these benchmarks only once
-      - name: Run compression benchmarks with AVX512
-        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
-        run: |
-          make bench_integer_compression_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_bit.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit.yml
@@ -1,224 +0,0 @@
-# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
-name: Integer GPU Multi-bit benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      all_precisions:
-        description: "Run all precisions"
-        type: boolean
-        default: false
-      fast_default:
-        description: "Run only deduplicated default operations without scalar variants"
-        type: boolean
-        default: false
-
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  FAST_BENCH: TRUE
-  BENCH_OP_FLAVOR: default
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-multi-bit-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
-
-  cuda-integer-multi-bit-benchmarks:
-    name: Execute GPU integer multi-bit benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Should run benchmarks with all precisions
-        if: inputs.all_precisions
-        run: |
-          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
-
-      - name: Should run fast subset benchmarks
-        if: inputs.fast_default
-        run: |
-          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make bench_unsigned_integer_multi_bit_gpu
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
@@ -1,194 +0,0 @@
-# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Integer multi GPU full benchmarks
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-integer-full-multi-gpu-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: multi-h100
-
-  cuda-integer-full-multi-gpu-benchmarks:
-    name: Execute multi GPU integer benchmarks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-H100x8" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
-          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_integer_weekly.yml
+++ b/.github/workflows/benchmark_gpu_integer_weekly.yml
@@ -0,0 +1,60 @@
+# Weekly CUDA benchmark suite: runs on several Hyperstack VMs and sends the parsed results to the Slab CI bot.
+name: Cuda weekly benchmarks
+
+on:
+  schedule:
+    # Fires once a week, every Saturday at 1 a.m.
+    - cron: "0 1 * * 6"
+
+jobs:
+  run-benchmarks-1-h100:
+    name: Run benchmarks (1xH100)
+    if: ${{ github.repository == 'zama-ai/tfhe-rs' }}
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    secrets: inherit
+    with:
+      hardware_name: "n3-H100x1"
+      profile: "single-h100"
+      command: "integer,integer_multi_bit"
+      op_flavor: "default"
+      bench_type: "latency"
+      all_precisions: true
+
+  run-benchmarks-2-h100:
+    name: Run benchmarks (2xH100)
+    if: ${{ github.repository == 'zama-ai/tfhe-rs' }}
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    secrets: inherit
+    with:
+      hardware_name: "n3-H100x2"
+      profile: "2-h100"
+      command: "integer_multi_bit"
+      op_flavor: "default"
+      bench_type: "latency"
+      all_precisions: true
+
+  run-benchmarks-8-h100:
+    name: Run benchmarks (8xH100)
+    if: ${{ github.repository == 'zama-ai/tfhe-rs' }}
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    secrets: inherit
+    with:
+      hardware_name: "n3-H100x8"
+      profile: "multi-h100"
+      command: "integer_multi_bit"
+      op_flavor: "default"
+      bench_type: "latency"
+      all_precisions: true
+
+  run-benchmarks-l40:
+    name: Run benchmarks (L40)
+    if: ${{ github.repository == 'zama-ai/tfhe-rs' }}
+    uses: ./.github/workflows/benchmark_gpu_integer_common.yml
+    secrets: inherit
+    with:
+      hardware_name: "n3-L40x1"
+      profile: "l40"
+      command: "integer_multi_bit,integer_compression,pbs,ks"
+      op_flavor: "default"
+      bench_type: "latency"
+      all_precisions: true
--- a/.github/workflows/benchmark_gpu_l40.yml
+++ b/.github/workflows/benchmark_gpu_l40.yml
@@ -1,206 +0,0 @@
-# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
-name: Cuda benchmarks (L40)
-
-on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-l40-benchmarks)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: l40 
-
-  cuda-l40-benchmarks:
-    name: Cuda benchmarks (L40)
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer_multi_bit]
-        op_flavor: [default]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
-    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Run compression benchmarks with AVX512
-        run: |
-          make bench_integer_compression_gpu
-
-      - name: Run PBS benchmarks 
-        run: |
-          make bench_pbs_gpu
-
-      - name: Run KS benchmarks 
-        run: |
-          make bench_ks_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "n3-L40x1" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-l40-benchmarks ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}
-          SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: Teardown instance (cuda-l40-benchmarks)
-    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -8,6 +8,14 @@ on:
        description: "Run all precisions"
        type: boolean
        default: false
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both

  schedule:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
@@ -27,6 +35,7 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE
+  BENCH_TYPE: latency

 jobs:
  prepare-matrix:
@@ -36,10 +45,10 @@ jobs:
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
    steps:
      - name: Weekly benchmarks
-        if: github.event_name == 'workflow_dispatch' ||
-          github.event.schedule == '0 1 * * 6'
+        if: github.event.schedule == '0 1 * * 6'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"

@@ -48,11 +57,31 @@ jobs:
        run: |
          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"

-      -  name: Set operation flavor output
-         id: set_op_flavor
-         run: |
+      - name: Set benchmark types
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Default benchmark type
+        if: github.event_name != 'workflow_dispatch'
+        run: |
+          echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
+
+      - name: Set operation flavor output
+        id: set_op_flavor
+        run: |
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
  setup-instance:
    name: Setup instance (integer-benchmarks)
    needs: prepare-matrix
@@ -62,7 +91,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -85,6 +114,7 @@ jobs:
      matrix:
        command: [ integer, integer_multi_bit]
        op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -100,13 +130,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -124,13 +149,13 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}

-      # Run these benchmarks only once
+      # Run these benchmarks only once per benchmark type
      - name: Run compression benchmarks with AVX512
        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
        run: |
-          make bench_integer_compression
+          make BENCH_TYPE=${{ matrix.bench_type }} bench_integer_compression

      - name: Parse results
        run: |
@@ -143,12 +168,12 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -173,7 +198,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -92,13 +92,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -125,8 +120,7 @@ jobs:
          --commit-date "${COMMIT_DATE}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
+          --name-suffix avx512

      # This small benchmark needs to be executed only once.
      - name: Measure key sizes
@@ -169,7 +163,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -8,6 +8,14 @@ on:
        description: "Run all precisions"
        type: boolean
        default: false
+      bench_type:
+        description: "Benchmarks type"
+        type: choice
+        default: latency
+        options:
+          - latency
+          - throughput
+          - both

  schedule:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
@@ -36,10 +44,10 @@ jobs:
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
+      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
    steps:
      - name: Weekly benchmarks
-        if: github.event_name == 'workflow_dispatch' ||
-          github.event.schedule == '0 1 * * 6'
+        if: github.event.schedule == '0 1 * * 6'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"

@@ -48,11 +56,31 @@ jobs:
        run: |
          echo "OP_FLAVOR=[\"default\", \"unchecked\"]" >> "${GITHUB_ENV}"

+      - name: Set benchmark types
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
+          else
+            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+          fi
+
+      - name: Default benchmark type
+        if: github.event_name != 'workflow_dispatch'
+        run: |
+          echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
+
      - name: Set operation flavor output
        id: set_op_flavor
        run: |
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

+      - name: Set benchmark types output
+        id: set_bench_type
+        run: |
+          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
  setup-instance:
    name: Setup instance (signed-integer-benchmarks)
    needs: prepare-matrix
@@ -62,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +112,8 @@ jobs:
      max-parallel: 1
      matrix:
        command: [ integer, integer_multi_bit ]
-        op_flavor: [ default, unchecked ]
+        op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
+        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -100,13 +129,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -124,7 +148,7 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_signed_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -137,12 +161,12 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
@@ -167,7 +191,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -0,0 +1,144 @@
+# Run FFT benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: FFT benchmarks
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  RUST_BACKTRACE: "full"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "main"
+  schedule:
+    # Job will be triggered each Thursday at 11p.m.
+    - cron: '0 23 * * 4'
+
+jobs:
+  # Asks Slab to start the benchmark instance; its runner label is exposed to the following jobs.
+  setup-ec2:
+    name: Setup EC2 instance (fft-benchmarks)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
+  fft-benchmarks:
+    name: Execute FFT benchmarks in EC2
+    needs: setup-ec2
+    # One run per workflow and ref: a newer run cancels the one still in progress.
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          # Fetch the whole history so that tags are available to "git describe" below.
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: nightly
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make bench_fft
+
+      - name: Parse AVX512 results
+        run: |
+          python3 ./ci/fft_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database concrete_fft \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --name-suffix avx512
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: ${{ github.sha }}_fft
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      # NOTE(review): "curl -k" disables TLS certificate verification; confirm this is intended.
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-fft benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+
+  # Runs even when the benchmark job fails, as long as the setup job was not skipped.
+  teardown-ec2:
+    name: Teardown EC2 instance (fft-benchmarks)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [ setup-ec2, fft-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "EC2 teardown (fft-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -0,0 +1,144 @@
+# Run NTT benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: NTT benchmarks
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  RUST_BACKTRACE: "full"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "main"
+  schedule:
+    # Job will be triggered each Friday at 11p.m.
+    - cron: "0 23 * * 5"
+
+jobs:
+  # Asks Slab to start the benchmark instance; its runner label is exposed to the following jobs.
+  setup-ec2:
+    name: Setup EC2 instance (ntt-benchmarks)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
+  ntt-benchmarks:
+    name: Execute NTT benchmarks in EC2
+    needs: setup-ec2
+    # One run per workflow and ref: a newer run cancels the one still in progress.
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          # Fetch the whole history so that tags are available to "git describe" below.
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: nightly
+
+      - name: Run benchmarks
+        run: |
+          make bench_ntt
+
+      - name: Parse results
+        run: |
+          python3 ./ci/ntt_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database concrete_ntt \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --name-suffix avx512
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: ${{ github.sha }}_ntt
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      # NOTE(review): "curl -k" disables TLS certificate verification; confirm this is intended.
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-ntt benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+
+  # Runs even when the benchmark job fails, as long as the setup job was not skipped.
+  teardown-ec2:
+    name: Teardown EC2 instance (ntt-benchmarks)
+    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
+    needs: [setup-ec2, ntt-benchmarks]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-ec2.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -0,0 +1,171 @@
+# Run benchmarks of the tfhe-zk-pok crate on an instance and return parsed results to Slab CI bot.
+name: tfhe-zk-pok benchmarks
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 3a.m.
+    - cron: '0 3 * * 6'
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  # Reports whether tfhe-zk-pok sources or this workflow file changed since the last remote commit.
+  should-run:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch' ||
+      ((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            zk_pok:
+              - tfhe-zk-pok/**
+              - .github/workflows/benchmark_tfhe_zk_pok.yml
+
+  # Starts an instance for manual runs, and on zama-ai/tfhe-rs for scheduled runs and pushes touching zk-pok files.
+  setup-instance:
+    name: Setup instance (tfhe-zk-pok-benchmarks)
+    runs-on: ubuntu-latest
+    needs: should-run
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'push' &&
+      github.repository == 'zama-ai/tfhe-rs' &&
+      needs.should-run.outputs.zk_pok_changed == 'true')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
+  tfhe-zk-pok-benchmarks:
+    name: Execute tfhe-zk-pok benchmarks
+    if: needs.setup-instance.result != 'skipped'
+    needs: setup-instance
+    # On main each commit gets its own group and is never cancelled; on other refs a newer run cancels the previous one.
+    concurrency:
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run benchmarks
+        run: |
+          make bench_tfhe_zk_pok
+
+      # NOTE(review): other benchmark workflows replaced "--throughput" with "--bench-type"; confirm the parser still accepts it.
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --crate tfhe-zk-pok \
+          --hardware "hpc7a.96xlarge" \
+          --backend cpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: ${{ github.sha }}_tfhe_zk_pok
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-zk-pok benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  # Runs even when the benchmark job fails, as long as the setup job was not skipped.
+  teardown-instance:
+    name: Teardown instance (tfhe-zk-pok-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, tfhe-zk-pok-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (tfhe-zk-pok-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -39,13 +39,13 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
            wasm_bench:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
              - tfhe-zk-pok/**
              - tfhe/src/**
              - '!tfhe/src/c_api/**'
@@ -64,7 +64,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -98,7 +98,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -199,7 +199,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -3,6 +3,12 @@ name: PKE ZK benchmarks

 on:
  workflow_dispatch:
+    inputs:
+      run_throughput:
+        description: "Run throughput benchmarks"
+        type: boolean
+        default: false
+
  push:
    branches:
      - main
@@ -20,6 +26,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  BENCH_TYPE: latency

 jobs:
  should-run:
@@ -36,13 +43,14 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
            zk_pok:
              - tfhe/Cargo.toml
-              - concrete-csprng/**
+              - tfhe-csprng/**
+              - tfhe-fft/**
              - tfhe-zk-pok/**
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
@@ -65,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,13 +105,8 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
      - name: Install rust
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -114,6 +117,11 @@ jobs:
          path: slab
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Should run throughput benchmarks
+        if: inputs.run_throughput
+        run: |
+          echo "BENCH_TYPE=throughput" >> "${GITHUB_ENV}"
+
      - name: Run benchmarks with AVX512
        run: |
          make bench_integer_zk
@@ -130,7 +138,7 @@ jobs:
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --throughput
+          --bench-type ${{ env.BENCH_TYPE }}

      - name: Parse CRS sizes results
        run: |
@@ -173,7 +181,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -1,4 +1,4 @@
-name: Cargo Build
+name: Cargo Build TFHE-rs

 on:
  pull_request:
@@ -28,7 +28,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -47,10 +47,10 @@ jobs:
        run: |
          make pcc

-      - name: Build concrete-csprng
+      - name: Build tfhe-csprng
        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
-          make build_concrete_csprng
+          make build_tfhe_csprng

      - name: Build Release core
        if: ${{ contains(matrix.os, 'ubuntu') }}
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -0,0 +1,46 @@
+# Build tfhe-fft
+name: Cargo Build tfhe-fft
+
+on:
+  pull_request:
+
+env:
+  CARGO_TERM_COLOR: always
+
+# A newer run for the same workflow and PR head branch cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  cargo-builds-fft:
+    runs-on: ${{ matrix.runner_type }}
+
+    strategy:
+      matrix:
+        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+      # Let the remaining platforms finish when one of them fails.
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      # Pre-commit checks run on the Ubuntu runner only, after installing libfftw3-dev.
+      - name: Run pcc checks
+        if: matrix.runner_type == 'ubuntu-latest'
+        run: |
+          sudo apt install -y libfftw3-dev
+          make pcc_fft
+
+      - name: Build release
+        run: |
+          make build_fft
+
+      - name: Build release no-std
+        run: |
+          make build_fft_no_std
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -0,0 +1,41 @@
+# Build tfhe-ntt
+name: Cargo Build tfhe-ntt
+
+on:
+  pull_request:
+
+env:
+  CARGO_TERM_COLOR: always
+
+# A newer run for the same workflow and PR head branch cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  cargo-builds:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+      # Let the remaining platforms finish when one of them fails.
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Run pcc checks
+        run: |
+          make pcc_ntt
+
+      - name: Build release
+        run: |
+          make build_ntt
+
+      - name: Build release no-std
+        run: |
+          make build_ntt_no_std
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -0,0 +1,72 @@
+# Test tfhe-fft
+name: Cargo Test tfhe-fft
+
+on:
+  pull_request:
+
+env:
+  CARGO_TERM_COLOR: always
+
+# A newer run for the same workflow and PR head branch cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  cargo-tests:
+    runs-on: ${{ matrix.runner_type }}
+    strategy:
+      matrix:
+        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+      # Let the remaining platforms finish when one of them fails.
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Test debug
+        run: |
+          make test_fft
+
+      - name: Test serialization
+        run: make test_fft_serde
+
+      - name: Test no-std
+        run: |
+          make test_fft_no_std
+
+  # Same platform matrix as above, run with the nightly toolchain.
+  cargo-tests-nightly:
+    runs-on: ${{ matrix.runner_type }}
+    strategy:
+      matrix:
+        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: nightly
+
+      - name: Test nightly
+        run: |
+          make test_fft_nightly
+
+      - name: Test no-std nightly
+        run: |
+          make test_fft_no_std_nightly
+
+  cargo-tests-node-js:
+    runs-on: "ubuntu-latest"
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Test node js
+        run: |
+          make install_node
+          make test_fft_node_js_ci
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -0,0 +1,62 @@
+# Test tfhe-ntt
+#
+# Runs the tfhe-ntt make test targets on every pull request, with the stable and
+# nightly toolchains, on Linux, macOS and Windows.
+name: Cargo Test tfhe-ntt
+
+on:
+  pull_request:
+
+env:
+  CARGO_TERM_COLOR: always
+
+# A newer run of this workflow for the same PR branch cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  cargo-tests:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+      # Let the remaining platforms finish when one of them fails.
+      fail-fast: false
+    steps:
+      # Same pinned checkout commit as the other workflows of this repository.
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      # dtolnay/rust-toolchain, pinned as in the other workflows, replaces the
+      # archived actions-rs/toolchain. It has no "override" input: the toolchain
+      # it installs becomes the rustup default.
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Test debug
+        run: make test_ntt
+
+      - name: Test no-std
+        run: make test_ntt_no_std
+
+  cargo-tests-nightly:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      # Nightly counterpart of the stable "Install Rust" step above.
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: nightly
+
+      - name: Test nightly
+        run: make test_ntt_nightly
+
+      - name: Test no-std nightly
+        run: make test_ntt_no_std_nightly
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -27,7 +27,7 @@ jobs:
          make lint_workflow

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ed00f72a3ca5b6eff8ad4d3ffdcacedb67a21db1 # v3.0.15
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@5d6ac37a4cef8b8df67f482a8e384987766f0213 # v3.0.17
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,19 +47,19 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          files_yaml: |
            tfhe:
              - tfhe/src/**
-            concrete_csprng:
-              - concrete-csprng/src/**
+            tfhe_csprng:
+              - tfhe-csprng/src/**

      - name: Generate Keys
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
@@ -83,7 +83,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238
+        uses: codecov/codecov-action@5c47607acb93fed5485fdbf7232e8a31425f672a
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238
+        uses: codecov/codecov-action@5c47607acb93fed5485fdbf7232e8a31425f672a
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -121,7 +121,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -65,7 +65,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (csprng-randomness-tests)
@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -40,7 +40,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,6 +44,9 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
@@ -64,7 +67,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +120,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -183,7 +186,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -33,7 +33,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -42,6 +42,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
@@ -62,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -115,7 +118,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -181,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -76,7 +76,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -139,7 +139,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -43,6 +43,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
@@ -64,7 +67,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +120,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -186,7 +189,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -110,7 +110,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -0,0 +1,201 @@
+# Signed integer GPU tests on an RTXA6000 VM on hyperstack with classical PBS
+name: TFHE Cuda Backend - Signed integer tests with classical PBS
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  # On pull requests, only labeling events trigger this workflow.
+  pull_request:
+    types: [ labeled ]
+
+jobs:
+  # Decides whether the GPU tests are needed: always outside of pull requests,
+  # otherwise only when one of the paths listed below changed.
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      # Read access is enough here: the job only checks out the code and lists
+      # the files changed by the pull request.
+      pull-requests: read
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml
+
+  # Starts the hyperstack instance that hosts the tests. On pull requests this
+  # requires should-run to report relevant changes and, for labeling events,
+  # the added label to be "approved".
+  setup-instance:
+    name: Setup instance (cuda-signed-classic-tests)
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: gpu-test
+
+  # Runs the tests on the runner whose label setup-instance exported.
+  cuda-tests-linux:
+    name: CUDA signed integer tests with classical PBS
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    # One run per workflow and ref: a newer run cancels the older one, except on main.
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run signed integer tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
+
+  # Sends a Slack message when a needed job failed and the tests were not skipped.
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  # Stops the instance whatever the outcome of the tests, as long as
+  # setup-instance was not skipped.
+  teardown-instance:
+    name: Teardown instance (cuda-signed-classic-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,6 +44,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
@@ -65,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -143,10 +146,6 @@ jobs:
        if: ${{ !cancelled() }}
        run: nvidia-smi

-      - name: Run signed integer tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
-
      - name: Run signed integer multi-bit tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
@@ -172,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -50,6 +50,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
@@ -71,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -125,7 +128,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -185,7 +188,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -0,0 +1,201 @@
+# Test unsigned integers on an RTXA6000 VM on hyperstack with the classical PBS
+name: TFHE Cuda Backend - Unsigned integer tests with classical PBS
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  # On pull requests, only labeling events trigger this workflow.
+  pull_request:
+    types: [ labeled ]
+
+jobs:
+  # Decides whether the GPU tests are needed: always outside of pull requests,
+  # otherwise only when one of the paths listed below changed.
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      # Read access is enough here: the job only checks out the code and lists
+      # the files changed by the pull request.
+      pull-requests: read
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**.md'
+              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml
+
+  # Starts the hyperstack instance that hosts the tests. On pull requests this
+  # requires should-run to report relevant changes and, for labeling events,
+  # the added label to be "approved".
+  setup-instance:
+    name: Setup instance (cuda-unsigned-classic-tests)
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: gpu-test
+
+  # Runs the tests on the runner whose label setup-instance exported.
+  cuda-tests-linux:
+    name: CUDA unsigned integer tests with classical PBS
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    # One run per workflow and ref: a newer run cancels the older one, except on main.
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run unsigned integer tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
+
+  # Sends a Slack message when a needed job failed and the tests were not skipped.
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  # Stops the instance whatever the outcome of the tests, as long as
+  # setup-instance was not skipped.
+  teardown-instance:
+    name: Teardown instance (cuda-unsigned-classic-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,12 +44,15 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
-              - '.github/workflows/gpu_unsigned_integer_tests.yml'
+              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml

@@ -65,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +119,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -143,10 +146,6 @@ jobs:
        if: ${{ !cancelled() }}
        run: nvidia-smi

-      - name: Run unsigned integer tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
-
      - name: Run unsigned integer multi-bit tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
@@ -172,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -50,6 +50,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
@@ -71,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -122,7 +125,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -182,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -3,7 +3,7 @@ name: Tests on M1 CPU
 on:
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  # Have a nightly build for M1 tests
  schedule:
    # * is a special character in YAML so you have to quote this string
@@ -27,7 +27,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  cargo-builds:
+  cargo-builds-m1:
    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
    runs-on: ["self-hosted", "m1mac"]
    # 12 hours, default is 6 hours, hopefully this is more than enough
@@ -36,20 +36,57 @@ jobs:
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Run pcc FFT checks
+        run: |
+          make pcc_fft
+
+      - name: Build FFT release
+        run: |
+          make build_fft
+
+      - name: Build FFT release no-std
+        run: |
+          make build_fft_no_std
+
+      - name: Run FFT tests
+        run: |
+          make test_fft
+          make test_fft_serde
+          make test_fft_nightly
+          make test_fft_no_std
+          make test_fft_no_std_nightly
+          # we don't run the js stuff here as it's causing issues with the M1 config
+
+      - name: Run pcc NTT checks
+        run: |
+          make pcc_ntt
+
+      - name: Build NTT release
+        run: |
+          make build_ntt
+
+      - name: Build NTT release no-std
+        run: |
+          make build_ntt_no_std
+
+      - name: Run NTT tests
+        run: |
+          make test_ntt_all
+
      - name: Run pcc checks
        run: |
          make pcc

-      - name: Build concrete-csprng
+      - name: Build tfhe-csprng
        run: |
-          make build_concrete_csprng
+          make build_tfhe_csprng

      - name: Build Release core
        run: |
@@ -75,9 +112,9 @@ jobs:
        run: |
          make build_c_api

-      - name: Run concrete-csprng tests
+      - name: Run tfhe-csprng tests
        run: |
-          make test_concrete_csprng
+          make test_tfhe_csprng

      - name: Run tfhe-zk-pok tests
        run: |
@@ -137,7 +174,7 @@ jobs:
    name: Remove m1_test label
    runs-on: ubuntu-latest
    needs:
-      - cargo-builds
+      - cargo-builds-m1
    if: ${{ always() }}
    steps:
      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
@@ -147,13 +184,13 @@ jobs:
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: Slack Notification
-        if: ${{ needs.cargo-builds.result != 'skipped' }}
+        if: ${{ needs.cargo-builds-m1.result != 'skipped' }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
-          SLACK_COLOR: ${{ needs.cargo-builds.result }}
+          SLACK_COLOR: ${{ needs.cargo-builds-m1.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds-m1.result }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -1,4 +1,4 @@
-name: Publish concrete-csprng release
+name: Publish tfhe-csprng release

 on:
  workflow_dispatch:
@@ -19,7 +19,7 @@ jobs:
      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

  publish_release:
-    name: Publish concrete-csprng Release
+    name: Publish tfhe-csprng Release
    needs: verify_tag
    runs-on: ubuntu-latest
    steps:
@@ -33,7 +33,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p concrete-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          cargo publish -p tfhe-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -43,6 +43,6 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -36,7 +36,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -70,7 +70,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -119,7 +119,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -0,0 +1,49 @@
+# Manually triggered workflow publishing a new release of the tfhe-fft crate (dry-run by default)
+name: Publish tfhe-fft release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  verify_tag:  # Calls the reusable verify_tagged_commit workflow; publish_release needs it to succeed
+    uses: ./.github/workflows/verify_tagged_commit.yml
+    secrets:
+      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+
+  publish_release:
+    name: Publish tfhe-fft Release
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    steps:
+      - name: Checkout
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          fetch-depth: 0  # 0 fetches the full history instead of a shallow clone
+
+      - name: Publish crate.io package
+        env:  # Read by the script as shell variables so the token is never pasted into the script text
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}  # Left unquoted in the script so an empty value adds no argument
+        run: |
+          cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" ${DRY_RUN}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # Only when a previous step of this job failed
+        continue-on-error: true  # A failing notification must not change the job outcome
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-fft release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -0,0 +1,49 @@
+# Manually triggered workflow publishing a new release of the tfhe-ntt crate (dry-run by default)
+name: Publish tfhe-ntt release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  verify_tag:  # Calls the reusable verify_tagged_commit workflow; publish_release needs it to succeed
+    uses: ./.github/workflows/verify_tagged_commit.yml
+    secrets:
+      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+
+  publish_release:
+    name: Publish tfhe-ntt Release
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    steps:
+      - name: Checkout
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          fetch-depth: 0  # 0 fetches the full history instead of a shallow clone
+
+      - name: Publish crate.io package
+        env:  # Read by the script as shell variables so the token is never pasted into the script text
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}  # Left unquoted in the script so an empty value adds no argument
+        run: |
+          cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" ${DRY_RUN}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # Only when a previous step of this job failed
+        continue-on-error: true  # A failing notification must not change the job outcome
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-ntt release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ target/

 # Some of our bench outputs
 /tfhe/benchmarks_parameters
+/tfhe-zk-pok/benchmarks_parameters
 **/*.csv

 # dieharder run log
@@ -28,6 +29,8 @@ backends/tfhe-cuda-backend/cuda/cmake-build-debug/
 tfhe/web_wasm_parallel_tests/server.PID
 venv/
 web-test-runner/
+node_modules/
+package-lock.json

 # Dir used for backward compatibility test data
 tfhe/tfhe-backward-compat-data/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,10 +2,12 @@
 resolver = "2"
 members = [
    "tfhe",
+    "tfhe-fft",
+    "tfhe-ntt",
    "tfhe-zk-pok",
    "tasks",
    "apps/trivium",
-    "concrete-csprng",
+    "tfhe-csprng",
    "backends/tfhe-cuda-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
@@ -16,6 +18,14 @@ exclude = [
    "utils/cargo-tfhe-lints-inner",
    "utils/cargo-tfhe-lints"
 ]
+[workspace.dependencies]
+aligned-vec = { version = "0.5", default-features = false }
+bytemuck = "1.14.3"
+dyn-stack = { version = "0.10", default-features = false }
+num-complex = "0.4"
+pulp = { version = "0.18.22", default-features = false }
+serde = { version = "1.0", default-features = false }
+wasm-bindgen = ">=0.2.86,<0.2.94"

 [profile.bench]
 lto = "fat"
--- a/319
+++ b/319
@@ -18,13 +18,15 @@ FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
 NIGHTLY_TESTS?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
+BENCH_TYPE?=latency
 NODE_VERSION=22.6
-FORWARD_COMPAT?=OFF
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=v0.3
+BACKWARD_COMPAT_DATA_BRANCH?=v0.4
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
+# Read the wasm-bindgen version from Cargo.toml by cutting on '"' (field 2 = text between the first two quotes), so the version itself cannot contain a quote '"'
+WASM_BINDGEN_VERSION:=$(shell grep '^wasm-bindgen[[:space:]]*=' Cargo.toml | cut -d '"' -f 2 | xargs)
 WEB_RUNNER_DIR=web-test-runner
 WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
@@ -43,12 +45,6 @@ else
 		COVERAGE_ONLY=
 endif

-ifeq ($(FORWARD_COMPAT),ON)
-		FORWARD_COMPAT_FEATURE=forward_compatibility
-else
-		FORWARD_COMPAT_FEATURE=
-endif
-
 # Variables used only for regex_engine example
 REGEX_STRING?=''
 REGEX_PATTERN?=''
@@ -98,12 +94,26 @@ install_rs_build_toolchain:
 	( echo "Unable to install $(RS_BUILD_TOOLCHAIN) toolchain, check your rustup installation. \
 	Rustup can be downloaded at https://rustup.rs/" && exit 1 )

+.PHONY: install_build_wasm32_target # Install the wasm32-unknown-unknown target for the build toolchain
+install_build_wasm32_target: install_rs_build_toolchain
+	rustup +$(RS_BUILD_TOOLCHAIN) target add wasm32-unknown-unknown || \
+	( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
+	Rustup can be downloaded at https://rustup.rs/" && exit 1 )
+
 .PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
 install_cargo_nextest: install_rs_build_toolchain
 	@cargo nextest --version > /dev/null 2>&1 || \
 	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-nextest --locked || \
 	( echo "Unable to install cargo nextest, unknown error." && exit 1 )

+# WASM_BINDGEN_VERSION is passed as-is to `cargo install --version`. If the root Cargo.toml ever
+# specifies a single version such as "0.2.96", a leading ^ should be added here (^0.2.96): those
+# dependencies are not locked, and the caret requirement lets us get the matching CLI.
+# If a version range is specified there is no need to add the leading ^
+.PHONY: install_wasm_bindgen_cli # Install wasm-bindgen-cli to get access to the test runner
+install_wasm_bindgen_cli: install_rs_build_toolchain
+	cargo +$(RS_BUILD_TOOLCHAIN) install --locked wasm-bindgen-cli --version "$(WASM_BINDGEN_VERSION)"
+
 .PHONY: install_wasm_pack # Install wasm-pack to build JS packages
 install_wasm_pack: install_rs_build_toolchain
 	@wasm-pack --version > /dev/null 2>&1 || \
@@ -308,6 +318,9 @@ clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE),zk-pok \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_boolean # Run clippy lints enabling the boolean features
 clippy_boolean: install_rs_check_toolchain
@@ -323,6 +336,9 @@ clippy_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE),zk-pok,shortint \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_integer # Run clippy lints enabling the integer features
 clippy_integer: install_rs_check_toolchain
@@ -347,7 +363,7 @@ clippy_rustdoc: install_rs_check_toolchain
 	fi && \
 	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats,strings \
 		-p $(TFHE_SPEC)

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -378,17 +394,17 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

-.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
-clippy_concrete_csprng: install_rs_check_toolchain
+.PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
+clippy_tfhe_csprng: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE) \
-		-p concrete-csprng -- --no-deps -D warnings
+		-p tfhe-csprng -- --no-deps -D warnings

 .PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
 clippy_zk_pok: install_rs_check_toolchain
@@ -404,12 +420,12 @@ clippy_versionable: install_rs_check_toolchain

 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
 clippy_versionable

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
-clippy_core clippy_concrete_csprng
+clippy_core clippy_tfhe_csprng

 .PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
 clippy_cuda_backend: install_rs_check_toolchain
@@ -475,7 +491,7 @@ build_tfhe_coverage: install_rs_build_toolchain
 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,$(FORWARD_COMPAT_FEATURE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok \
 		-p $(TFHE_SPEC)

 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
@@ -487,7 +503,7 @@ build_c_api_gpu: install_rs_check_toolchain
 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
 		-p $(TFHE_SPEC)

 .PHONY: build_web_js_api # Build the js API targeting the web browser
@@ -515,10 +531,10 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
 		wasm-pack build --release --target=nodejs \
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok

-.PHONY: build_concrete_csprng # Build concrete_csprng
-build_concrete_csprng: install_rs_build_toolchain
+.PHONY: build_tfhe_csprng # Build tfhe_csprng
+build_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng --all-targets
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng --all-targets

 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
@@ -749,10 +765,15 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

-.PHONY: test_safe_deserialization # Run the tests for safe deserialization
-test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
+.PHONY: test_safe_serialization # Run the tests for safe serialization
+test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_deserialization::
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::
+
+.PHONY: test_zk # Run the tests for the zk module of the TFHE-rs crate
+test_zk: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),shortint,zk-pok -p $(TFHE_SPEC) -- zk::

 .PHONY: test_integer # Run all the tests for integer
 test_integer: install_rs_build_toolchain
@@ -779,6 +800,13 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
 		-E "test(/high_level_api::.*gpu.*/)"

+.PHONY: test_strings # Run the tests for strings ci
+test_strings: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer,strings -p $(TFHE_SPEC) \
+		-- strings::
+
+
 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
@@ -792,11 +820,7 @@ test_user_doc_gpu: install_rs_build_toolchain
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
 		-- test_user_docs::

-.PHONY: test_fhe_strings # Run tests for fhe_strings example
-test_fhe_strings: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example fhe_strings \
-		--features=$(TARGET_ARCH_FEATURE),integer
+

 .PHONY: test_regex_engine # Run tests for regex_engine example
 test_regex_engine: install_rs_build_toolchain
@@ -823,16 +847,29 @@ test_kreyvium: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-trivium -- --test-threads=1 kreyvium::

-.PHONY: test_concrete_csprng # Run concrete-csprng tests
-test_concrete_csprng: install_rs_build_toolchain
+.PHONY: test_tfhe_csprng # Run tfhe-csprng tests
+test_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng

 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok

+.PHONY: test_zk_wasm_x86_compat_ci # Run test_zk_wasm_x86_compat after installing and selecting node NODE_VERSION with nvm
+test_zk_wasm_x86_compat_ci: check_nvm_installed
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) test_zk_wasm_x86_compat
+
+.PHONY: test_zk_wasm_x86_compat # Check compatibility between wasm and x86_64 proofs
+test_zk_wasm_x86_compat: install_rs_build_toolchain build_node_js_api
+	cd tfhe/tests/zk_wasm_x86_test && npm install
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		-p tfhe --test zk_wasm_x86_test --features=$(TARGET_ARCH_FEATURE),integer,zk-pok
+
 .PHONY: test_versionable # Run tests for tfhe-versionable subcrate
 test_versionable: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -979,7 +1016,7 @@ no_dbg_log:
 	@./scripts/no_dbg_calls.sh

 .PHONY: dieharder_csprng # Run the dieharder test suite on our CSPRNG implementation
-dieharder_csprng: install_dieharder build_concrete_csprng
+dieharder_csprng: install_dieharder build_tfhe_csprng
 	./scripts/dieharder_test.sh

 #
@@ -993,40 +1030,42 @@ print_doc_bench_parameters:

 .PHONY: bench_integer # Run benchmarks for unsigned integer
 bench_integer: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
@@ -1034,7 +1073,7 @@ bench_integer_multi_bit: install_rs_check_toolchain

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
@@ -1042,23 +1081,23 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
+	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
+	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
@@ -1080,7 +1119,7 @@ bench_shortint_oprf: install_rs_check_toolchain

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
@@ -1164,6 +1203,11 @@ bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	--bench hlapi-erc20 \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --

+.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
+bench_tfhe_zk_pok: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
+
 #
 # Utility tools
 #
@@ -1262,6 +1306,189 @@ check_compile_tests
 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt

+#=============================== FFT Section ==================================
+.PHONY: doc_fft # Build rust doc for tfhe-fft
+doc_fft: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
+	RUSTDOCFLAGS="--html-in-header katex-header.html" \
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
+		--all-features --no-deps -p tfhe-fft
+
+.PHONY: docs_fft # Build rust doc for tfhe-fft, alias for doc_fft
+docs_fft: doc_fft
+
+.PHONY: lint_doc_fft # Build rust doc for tfhe-fft with linting enabled
+lint_doc_fft: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
+	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
+		--all-features --no-deps -p tfhe-fft
+
+.PHONY: lint_docs_fft # Build rust doc for tfhe-fft with linting enabled, alias for lint_doc_fft
+lint_docs_fft: lint_doc_fft
+
+.PHONY: clippy_fft # Run clippy lints on tfhe-fft
+clippy_fft: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--all-features -p tfhe-fft -- --no-deps -D warnings
+
+.PHONY: pcc_fft # pcc stands for pre commit checks
+pcc_fft: check_fmt lint_doc_fft clippy_fft
+
+.PHONY: build_fft
+build_fft: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
+		--features=fft128
+
+.PHONY: build_fft_no_std
+build_fft_no_std: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
+		--no-default-features
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
+		--no-default-features \
+		--features=fft128
+
+##### Tests #####
+
+.PHONY: test_fft
+test_fft: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
+		--features=fft128
+
+.PHONY: test_fft_serde
+test_fft_serde: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
+		--features=serde
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
+		--features=serde,fft128
+
+.PHONY: test_fft_nightly
+test_fft_nightly: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
+		--features=nightly
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
+		--features=nightly,fft128
+
+.PHONY: test_fft_no_std
+test_fft_no_std: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
+		--no-default-features 
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
+		--no-default-features \
+		--features=fft128
+
+.PHONY: test_fft_no_std_nightly
+test_fft_no_std_nightly: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
+		--no-default-features \
+		--features=nightly
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
+		--no-default-features \
+		--features=nightly,fft128
+
+.PHONY: test_fft_node_js
+test_fft_node_js: install_rs_build_toolchain install_build_wasm32_target install_wasm_bindgen_cli
+	RUSTFLAGS="" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release \
+		--features=serde --target wasm32-unknown-unknown -p tfhe-fft
+
+.PHONY: test_fft_node_js_ci
+test_fft_node_js_ci: check_nvm_installed
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	"$(MAKE)" test_fft_node_js
+
+.PHONY: test_fft_all
+test_fft_all: test_fft test_fft_serde test_fft_nightly test_fft_no_std test_fft_no_std_nightly \
+test_fft_node_js_ci
+
+##### Bench #####
+
+.PHONY: bench_fft # Run FFT benchmarks
+bench_fft: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" bench --bench fft -p tfhe-fft \
+		--features=serde \
+		--features=nightly \
+		--features=fft128
+#============================End FFT Section ==================================
+
+#=============================== NTT Section ==================================
+.PHONY: doc_ntt # Build rust doc for tfhe-ntt
+doc_ntt: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
+	RUSTDOCFLAGS="--html-in-header katex-header.html" \
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
+		--all-features --no-deps -p tfhe-ntt
+
+.PHONY: docs_ntt # Build rust doc for tfhe-ntt, alias for doc_ntt
+docs_ntt: doc_ntt
+
+.PHONY: lint_doc_ntt # Build rust doc for tfhe-ntt with linting enabled
+lint_doc_ntt: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
+	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
+		--all-features --no-deps -p tfhe-ntt
+
+.PHONY: lint_docs_ntt # Build rust doc for tfhe-ntt with linting enabled, alias for lint_doc_ntt
+lint_docs_ntt: lint_doc_ntt
+
+.PHONY: clippy_ntt # Run clippy lints on tfhe-ntt
+clippy_ntt: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--all-features -p tfhe-ntt -- --no-deps -D warnings
+
+.PHONY: pcc_ntt # pcc stands for pre commit checks
+pcc_ntt: check_fmt lint_doc_ntt clippy_ntt
+
+.PHONY: build_ntt
+build_ntt: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-ntt
+
+.PHONY: build_ntt_no_std
+build_ntt_no_std: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-ntt \
+		--no-default-features
+
+##### Tests #####
+
+.PHONY: test_ntt
+test_ntt: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt
+
+.PHONY: test_ntt_nightly
+test_ntt_nightly: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-ntt \
+		--features=nightly
+
+.PHONY: test_ntt_no_std
+test_ntt_no_std: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt \
+		--no-default-features 
+
+.PHONY: test_ntt_no_std_nightly
+test_ntt_no_std_nightly: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-ntt \
+		--no-default-features \
+		--features=nightly
+
+.PHONY: test_ntt_all
+test_ntt_all: test_ntt test_ntt_no_std test_ntt_nightly test_ntt_no_std_nightly
+
+##### Bench #####
+
+.PHONY: bench_ntt # Run NTT benchmarks
+bench_ntt: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" bench --bench ntt -p tfhe-ntt \
+		--features=nightly
+#============================End NTT Section ==================================
+
 .PHONY: help # Generate list of targets with descriptions
 help:
 	@grep '^\.PHONY: .* #' Makefile | sed 's/\.PHONY: \(.*\) # \(.*\)/\1\t\2/' | expand -t30 | sort
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"]
 ```

 > [!Note]
-> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
+> Note: You need to use a Rust version >= 1.81 to compile TFHE-rs.

 > [!Note]
 > Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.5.0"
+version = "0.6.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -62,6 +62,7 @@ fn main() {
            "cuda/include/integer/integer.h",
            "cuda/include/keyswitch.h",
            "cuda/include/linear_algebra.h",
+            "cuda/include/pbs/fft.h",
            "cuda/include/pbs/programmable_bootstrap.h",
            "cuda/include/pbs/programmable_bootstrap_multibit.h",
        ];
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -83,7 +83,7 @@ endif()
 set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
  -std=c++17 --no-exceptions  --expt-relaxed-constexpr -rdc=true \
-  --use_fast_math -Xcompiler -fPIC")
+  --use_fast_math -Xcompiler -fPIC --ptxas-options=-v")

 set(INCLUDE_DIR include)

--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -39,7 +39,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

-void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -103,17 +103,19 @@ void cleanup_cuda_full_propagation(void *const *streams,

 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);

 void cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
+    void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
+    void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
+    void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks);

 void cleanup_cuda_integer_mult(void *const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -297,47 +299,6 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void);

-void scratch_cuda_fast_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
-    uint32_t uses_carry, bool allocate_gpu_memory);
-
-void cuda_fast_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks,
-    uint32_t requested_flag, uint32_t uses_carry);
-
-void cleanup_cuda_fast_propagate_single_carry(void *const *streams,
-                                              uint32_t const *gpu_indexes,
-                                              uint32_t gpu_count,
-                                              int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
-    bool allocate_gpu_memory);
-
-void cuda_integer_overflowing_sub_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs_array, const void *rhs_array, void *overflow_block,
-    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
-    uint32_t uses_input_borrow);
-
-void cleanup_cuda_integer_overflowing_sub(void *const *streams,
-                                          uint32_t const *gpu_indexes,
-                                          uint32_t gpu_count,
-                                          int8_t **mem_ptr_void);
-
 void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -398,16 +359,17 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,

 void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);

 void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *quotient, void *remainder, void const *numerator, void const *divisor,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_blocks_in_radix);

 void cleanup_cuda_integer_div_rem(void *const *streams,
@@ -458,5 +420,24 @@ void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t num_blocks,
                                            uint32_t lwe_size);

+void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_abs_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count,
+                                      int8_t **mem_ptr_void);
+
 } // extern C
 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -27,12 +27,6 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void const *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_64_with_packing(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in_1, void const *lwe_array_in_2,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus);
-
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/fft.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/fft.h
@@ -0,0 +1,26 @@
+#ifndef CUDA_PBS_FFT_H
+#define CUDA_PBS_FFT_H
+
+#include <stdint.h>
+
+// C-linkage declarations of the f128 Fourier transform entry points.
+// Each call takes a stream handle and a GPU index as its first two arguments.
+// NOTE(review): re0/re1 and im0/im1 presumably carry the two double halves of
+// the real and imaginary parts; confirm against the implementation.
+extern "C" {
+void cuda_fourier_transform_forward_as_torus_f128_async(
+    void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
+    void *im1, void const *standard, uint32_t const N,
+    const uint32_t number_of_samples);
+
+void cuda_fourier_transform_forward_as_integer_f128_async(
+    void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
+    void *im1, void const *standard, uint32_t const N,
+    const uint32_t number_of_samples);
+
+void cuda_fourier_transform_backward_as_torus_f128_async(
+    void *stream, uint32_t gpu_index, void *standard, void const *re0,
+    void const *re1, void const *im0, void const *im1, uint32_t const N,
+    const uint32_t number_of_samples);
+} // extern C
+#endif // CUDA_PBS_FFT_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -106,7 +106,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  uint32_t lwe_chunk_size;
  double2 *keybundle_fft;
  Torus *global_accumulator;
-  double2 *global_accumulator_fft;
+  double2 *global_join_buffer;

  PBS_VARIANT pbs_variant;

@@ -225,10 +225,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
      global_accumulator = (Torus *)cuda_malloc_async(
-          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
-          gpu_index);
-      global_accumulator_fft = (double2 *)cuda_malloc_async(
-          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
+          input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+              sizeof(Torus),
+          stream, gpu_index);
+      global_join_buffer = (double2 *)cuda_malloc_async(
+          level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+              (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
    }
  }
@@ -260,7 +262,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {

    cuda_drop_async(keybundle_fft, stream, gpu_index);
    cuda_drop_async(global_accumulator, stream, gpu_index);
-    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+    cuda_drop_async(global_join_buffer, stream, gpu_index);
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -69,7 +69,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  Torus *global_accumulator;
-  double2 *global_accumulator_fft;
+  double2 *global_join_buffer;

  PBS_VARIANT pbs_variant;

@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                (polynomial_size / 2) * sizeof(double2),
            stream, gpu_index);
@@ -147,7 +147,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
@@ -194,7 +194,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
@@ -208,7 +208,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

  void release(cudaStream_t stream, uint32_t gpu_index) {
    cuda_drop_async(d_mem, stream, gpu_index);
-    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+    cuda_drop_async(global_join_buffer, stream, gpu_index);

    if (pbs_variant == DEFAULT)
      cuda_drop_async(global_accumulator, stream, gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_CRYPTO_CUH
-#define CNCRT_CRPYTO_CUH
+#define CNCRT_CRYPTO_CUH

+#include "crypto/torus.cuh"
 #include "device.h"
 #include <cstdint>

@@ -21,7 +22,6 @@ private:
  uint32_t base_log;
  uint32_t mask;
  uint32_t num_poly;
-  int current_level;
  T mask_mod_b;
  T *state;

@@ -32,13 +32,6 @@ public:
        state(state) {

    mask_mod_b = (1ll << base_log) - 1ll;
-    current_level = level_count;
-    int tid = threadIdx.x;
-    for (int i = 0; i < num_poly * params::opt; i++) {
-      state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
  }

  // Decomposes all polynomials at once
@@ -52,28 +45,30 @@ public:
  // Decomposes a single polynomial
  __device__ void decompose_and_compress_next_polynomial(double2 *result,
                                                         int j) {
-    if (j == 0)
-      current_level -= 1;
-
-    int tid = threadIdx.x;
-    auto state_slice = state + j * params::degree;
+    uint32_t tid = threadIdx.x;
+    auto state_slice = &state[j * params::degree];
    for (int i = 0; i < params::opt / 2; i++) {
-      T res_re = state_slice[tid] & mask_mod_b;
-      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
-      state_slice[tid] >>= base_log;
-      state_slice[tid + params::degree / 2] >>= base_log;
-      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
-      T carry_im =
-          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
+      auto input1 = &state_slice[tid];
+      auto input2 = &state_slice[tid + params::degree / 2];
+      T res_re = *input1 & mask_mod_b;
+      T res_im = *input2 & mask_mod_b;
+
+      *input1 >>= base_log; // Update state
+      *input2 >>= base_log; // Update state
+
+      T carry_re = ((res_re - 1ll) | *input1) & res_re;
+      T carry_im = ((res_im - 1ll) | *input2) & res_im;
      carry_re >>= (base_log - 1);
      carry_im >>= (base_log - 1);
-      state_slice[tid] += carry_re;
-      state_slice[tid + params::degree / 2] += carry_im;
+
+      *input1 += carry_re; // Update state
+      *input2 += carry_im; // Update state
+
      res_re -= carry_re << base_log;
      res_im -= carry_im << base_log;

-      result[tid].x = (int32_t)res_re;
-      result[tid].y = (int32_t)res_im;
+      typecast_torus_to_double(res_re, result[tid].x);
+      typecast_torus_to_double(res_im, result[tid].y);

      tid += params::degree / params::opt;
    }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -71,12 +71,10 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,

    // This loop distribution seems to benefit the global mem reads
    for (int i = start_i; i < end_i; i++) {
-      Torus a_i = round_to_closest_multiple(block_lwe_array_in[i], base_log,
-                                            level_count);
-      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
+      Torus state =
+          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);

-      for (int j = level_count - 1; j >= 0; j--) {
-        // Levels are stored in reverse order
+      for (int j = 0; j < level_count; j++) {
        auto ksk_block =
            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -202,15 +200,13 @@ __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
    // Iterate through all lwe elements
    for (int i = 0; i < lwe_dimension_in; i++) {
      // Round and prepare decomposition
-      Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
+      Torus state = init_decomposer_state(lwe_in[i], base_log, level_count);

-      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
      Torus mod_b_mask = (1ll << base_log) - 1ll;

      // block of key for current lwe coefficient (cur_input_lwe[i])
      auto ksk_block = &fp_ksk[i * ksk_block_size];
-      for (int j = level_count - 1; j >= 0; j--) {
-        // Levels are stored in reverse order
+      for (int j = 0; j < level_count; j++) {
        auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
        // Iterate through each level and multiply by the ksk piece
        auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_TORUS_CUH
 #define CNCRT_TORUS_CUH

+#include "device.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
 #include "utils/kernel_dimensions.cuh"
@@ -11,6 +12,11 @@ __host__ __device__ __forceinline__ constexpr double get_two_pow_torus_bits() {
  return (sizeof(T) == 4) ? 4294967296.0 : 18446744073709551616.0;
 }

+template <typename T>
+__host__ __device__ __forceinline__ constexpr T scalar_max() {
+  return std::numeric_limits<T>::max();
+}
+
 template <typename T>
 __device__ inline void typecast_double_to_torus(double x, T &r) {
  r = T(x);
@@ -44,14 +50,36 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) {
 }

 template <typename T>
-__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
-                                              uint32_t level_count) {
-  const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
-  const T shift = non_rep_bit_count - 1;
-  T res = x >> shift;
-  res += 1;
-  res &= (T)(-2);
-  return res << shift;
+__device__ inline void typecast_torus_to_double(T x, double &r);
+
+template <>
+__device__ inline void typecast_torus_to_double<uint32_t>(uint32_t x,
+                                                          double &r) {
+  r = __int2double_rn(x);
+}
+
+template <>
+__device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
+                                                          double &r) {
+  r = __ll2double_rn(x);
+}
+
+template <typename T> // returns `input` rounded to its top bits, then balanced
+__device__ inline T init_decomposer_state(T input, uint32_t base_log,
+                                          uint32_t level_count) {
+  const T rep_bit_count = level_count * base_log; // number of top bits kept
+  const T non_rep_bit_count = sizeof(T) * 8 - rep_bit_count; // low bits dropped
+  T res = input >> (non_rep_bit_count - 1); // keep one extra bit for rounding
+  T rounding_bit = res & (T)(1); // highest of the dropped bits
+  res++; // round to nearest: add one at the extra bit ...
+  res >>= 1; // ... then discard that extra bit
+  T torus_max = scalar_max<T>();
+  T mod_mask = torus_max >> non_rep_bit_count; // low rep_bit_count bits set
+  res &= mod_mask; // reduce modulo 2^rep_bit_count
+  T shifted_random = rounding_bit << (rep_bit_count - 1);
+  T need_balance = // 1 if res > 2^(rep-1), or res == 2^(rep-1) with rounding_bit
+      (((res - (T)(1)) | shifted_random) & res) >> (rep_bit_count - 1);
+  return res - (need_balance << rep_bit_count); // subtract 2^rep (wraps in T)
 }

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -97,7 +97,7 @@ bool cuda_check_support_thread_block_clusters() {
 }

 /// Copy memory to the GPU asynchronously
-void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
@@ -268,17 +268,20 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
 /// Get the maximum size for the shared memory
+/// Returns a hard-coded value when CUDA_ARCH equals one of the values listed
+/// below; for any other value the limit is read from the runtime attribute
+/// cudaDevAttrMaxSharedMemoryPerBlock of device `gpu_index`.
 int cuda_get_max_shared_memory(uint32_t gpu_index) {
  int max_shared_memory = 0;
-  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
-                         gpu_index);
-  check_cuda_error(cudaGetLastError());
 #if CUDA_ARCH == 900
  max_shared_memory = 226000;
 #elif CUDA_ARCH == 890
  max_shared_memory = 100000;
+#elif CUDA_ARCH == 860
+  max_shared_memory = 100000;
 #elif CUDA_ARCH == 800
  max_shared_memory = 163000;
 #elif CUDA_ARCH == 700
  max_shared_memory = 95000;
+#else
+  // Unknown architecture: ask the runtime instead of using a fixed budget.
+  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
+                         gpu_index);
+  check_cuda_error(cudaGetLastError());
 #endif
  return max_shared_memory;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -0,0 +1,370 @@
+
+#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
+#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
+
+#include <cstdint>
+#include <cstring>
+
+// Double-double ("f128") value: an unevaluated pair of doubles (hi, lo).
+// All arithmetic helpers are static and return a new f128. When compiling
+// device code (__CUDA_ARCH__ defined) the explicitly rounded intrinsics
+// (__dadd_rn, __dsub_rn, __dmul_rn, __fma_rn) are used; host code uses the
+// plain operators and fma().
+struct alignas(16) f128 {
+  double hi; // leading component
+  double lo; // trailing (correction) component
+
+  // Default and parameterized constructors
+  __host__ __device__ f128() : hi(0.0), lo(0.0) {}
+  __host__ __device__ f128(double high, double low) : hi(high), lo(low) {}
+
+  // Quick two-sum: returns (s, e) with s = a + b and e = b - (s - a).
+  // NOTE(review): this is the usual Fast2Sum shape, whose error term is
+  // presumably only exact when |a| >= |b| — callers should guarantee that.
+  __host__ __device__ __forceinline__ static f128 quick_two_sum(double a,
+                                                                double b) {
+#ifdef __CUDA_ARCH__
+    double s = __dadd_rn(a, b);
+    return f128(s, __dsub_rn(b, __dsub_rn(s, a)));
+#else
+    double s = a + b;
+    return f128(s, b - (s - a));
+#endif
+  }
+
+  // Two-sum: returns (s, e) with s = a + b and
+  // e = (a - (s - bb)) + (b - bb), where bb = s - a. No ordering of the
+  // operands is required by this form.
+  __host__ __device__ __forceinline__ static f128 two_sum(double a, double b) {
+#ifdef __CUDA_ARCH__
+    double s = __dadd_rn(a, b);
+    double bb = __dsub_rn(s, a);
+    return f128(s, __dadd_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dsub_rn(b, bb)));
+#else
+    double s = a + b;
+    double bb = s - a;
+    return f128(s, (a - (s - bb)) + (b - bb));
+#endif
+  }
+
+  // Two-product: returns (p, e) with p = a * b and e = fma(a, b, -p), i.e.
+  // the rounding error of the product recovered with a fused multiply-add.
+  __host__ __device__ __forceinline__ static f128 two_prod(double a, double b) {
+
+#ifdef __CUDA_ARCH__
+    double p = __dmul_rn(a, b);
+    double p2 = __fma_rn(a, b, -p);
+#else
+    double p = a * b;
+    double p2 = fma(a, b, -p);
+#endif
+    return f128(p, p2);
+  }
+
+  // Two-diff: returns (s, e) with s = a - b and
+  // e = (a - (s - bb)) - (b + bb), where bb = s - a.
+  __host__ __device__ __forceinline__ static f128 two_diff(double a, double b) {
+#ifdef __CUDA_ARCH__
+    double s = __dsub_rn(a, b);
+    double bb = __dsub_rn(s, a);
+    return f128(s, __dsub_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dadd_rn(b, bb)));
+#else
+    double s = a - b;
+    double bb = s - a;
+    return f128(s, (a - (s - bb)) - (b + bb));
+#endif
+  }
+
+  // Addition: two_sum on the hi parts and on the lo parts, one
+  // renormalization after folding in t.hi, then t.lo is added to the low
+  // component.
+  // NOTE(review): unlike sub(), the result is not renormalized a second time
+  // after adding t.lo — confirm this asymmetry is intended.
+  __host__ __device__ static f128 add(const f128 &a, const f128 &b) {
+    auto s = two_sum(a.hi, b.hi);
+    auto t = two_sum(a.lo, b.lo);
+
+    double hi = s.hi;
+    double lo = s.lo + t.hi;
+    hi = hi + lo;
+    lo = lo - (hi - s.hi);
+
+    return f128(hi, lo + t.lo);
+  }
+
+  // Addition with estimate: two_sum on the hi parts only; the lo parts are
+  // added into the error term with plain rounding, then one quick_two_sum.
+  __host__ __device__ static f128 add_estimate(const f128 &a, const f128 &b) {
+    auto se = two_sum(a.hi, b.hi);
+#ifdef __CUDA_ARCH__
+    se.lo = __dadd_rn(se.lo, __dadd_rn(a.lo, b.lo));
+#else
+    se.lo += (a.lo + b.lo);
+#endif
+
+    return quick_two_sum(se.hi, se.lo);
+  }
+
+  // Subtraction with estimate: two_diff on the hi parts only; a.lo is added
+  // to and b.lo subtracted from the error term, then one quick_two_sum.
+  __host__ __device__ static f128 sub_estimate(const f128 &a, const f128 &b) {
+    f128 se = two_diff(a.hi, b.hi);
+#ifdef __CUDA_ARCH__
+    se.lo = __dadd_rn(se.lo, a.lo);
+    se.lo = __dsub_rn(se.lo, b.lo);
+#else
+    se.lo += a.lo;
+    se.lo -= b.lo;
+#endif
+    return quick_two_sum(se.hi, se.lo);
+  }
+
+  // Subtraction: two_diff on the hi parts and on the lo parts, with a
+  // quick_two_sum renormalization after each of the two error terms is
+  // folded in.
+  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
+    auto s = two_diff(a.hi, b.hi);
+    auto t = two_diff(a.lo, b.lo);
+    s = quick_two_sum(s.hi, s.lo + t.hi);
+    return quick_two_sum(s.hi, s.lo + t.lo);
+  }
+
+  // Multiplication: exact product of the hi parts plus the two cross terms
+  // a.hi * b.lo and a.lo * b.hi; the a.lo * b.lo term is not computed.
+  __host__ __device__ static f128 mul(const f128 &a, const f128 &b) {
+    auto p = two_prod(a.hi, b.hi);
+#ifdef __CUDA_ARCH__
+    double a_0_x_b_1 = __dmul_rn(a.hi, b.lo);
+    double a_1_x_b_0 = __dmul_rn(a.lo, b.hi);
+    p.lo = __dadd_rn(p.lo, __dadd_rn(a_0_x_b_1, a_1_x_b_0));
+#else
+    p.lo += (a.hi * b.lo + a.lo * b.hi);
+#endif
+    p = quick_two_sum(p.hi, p.lo);
+    return p;
+  }
+
+  // Complex product c = a * b on (re, im) pairs. All four partial products
+  // are computed before c_re / c_im are written, so the outputs may alias
+  // the inputs.
+  __host__ __device__ static void
+  cplx_f128_mul_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
+                       const f128 &a_im, const f128 &b_re, const f128 &b_im) {
+    auto a_re_x_b_re = mul(a_re, b_re);
+    auto a_re_x_b_im = mul(a_re, b_im);
+    auto a_im_x_b_re = mul(a_im, b_re);
+    auto a_im_x_b_im = mul(a_im, b_im);
+
+    c_re = sub_estimate(a_re_x_b_re, a_im_x_b_im);
+    c_im = add_estimate(a_im_x_b_re, a_re_x_b_im);
+  }
+
+  // Complex difference c = a - b, component-wise with sub_estimate.
+  __host__ __device__ static void
+  cplx_f128_sub_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
+                       const f128 &a_im, const f128 &b_re, const f128 &b_im) {
+    c_re = sub_estimate(a_re, b_re);
+    c_im = sub_estimate(a_im, b_im);
+  }
+
+  // Complex sum c = a + b, component-wise with add_estimate.
+  __host__ __device__ static void
+  cplx_f128_add_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
+                       const f128 &a_im, const f128 &b_re, const f128 &b_im) {
+    c_re = add_estimate(a_re, b_re);
+    c_im = add_estimate(a_im, b_im);
+  }
+};
+
+// Complex number whose real and imaginary parts are double-double (f128)
+// values. The operators below use the fully renormalizing f128::add /
+// f128::mul helpers.
+struct f128x2 {
+  f128 re; // real part
+  f128 im; // imaginary part
+
+  __host__ __device__ f128x2() : re(), im() {}
+
+  __host__ __device__ f128x2(const f128 &real, const f128 &imag)
+      : re(real), im(imag) {}
+
+  // Builds the value from plain doubles; both low components are zero.
+  __host__ __device__ f128x2(double real, double imag)
+      : re(real, 0.0), im(imag, 0.0) {}
+
+  // Real-only value; explicit so a double never converts silently.
+  __host__ __device__ explicit f128x2(double real)
+      : re(real, 0.0), im(0.0, 0.0) {}
+
+  __host__ __device__ f128x2(const f128x2 &other)
+      : re(other.re), im(other.im) {}
+
+  __host__ __device__ f128x2(f128x2 &&other) noexcept
+      : re(std::move(other.re)), im(std::move(other.im)) {}
+
+  __host__ __device__ f128x2 &operator=(const f128x2 &other) {
+    if (this != &other) {
+      re = other.re;
+      im = other.im;
+    }
+    return *this;
+  }
+
+  __host__ __device__ f128x2 &operator=(f128x2 &&other) noexcept {
+    if (this != &other) {
+      re = std::move(other.re);
+      im = std::move(other.im);
+    }
+    return *this;
+  }
+
+  // Returns (re, -im); both components of im are negated.
+  __host__ __device__ f128x2 conjugate() const {
+    return f128x2(re, f128(-im.hi, -im.lo));
+  }
+
+  // Returns re * re + im * im as an f128.
+  __host__ __device__ f128 norm_squared() const {
+    return f128::add(f128::mul(re, re), f128::mul(im, im));
+  }
+
+  // Resets both parts to zero.
+  __host__ __device__ void zero() {
+    re = f128(0.0, 0.0);
+    im = f128(0.0, 0.0);
+  }
+
+  // Addition
+  __host__ __device__ friend f128x2 operator+(const f128x2 &a,
+                                              const f128x2 &b) {
+    return f128x2(f128::add(a.re, b.re), f128::add(a.im, b.im));
+  }
+
+  // Subtraction (implemented as addition of the negated right operand)
+  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
+                                              const f128x2 &b) {
+    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
+                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
+  }
+
+  // Multiplication (complex multiplication)
+  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
+                                              const f128x2 &b) {
+    // The im * im product is needed for both of its components, so it is
+    // computed once and negated component-wise.
+    const f128 im_x_im = f128::mul(a.im, b.im);
+    f128 real_part =
+        f128::add(f128::mul(a.re, b.re), f128(-im_x_im.hi, -im_x_im.lo));
+    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
+    return f128x2(real_part, imag_part);
+  }
+
+  // Addition-assignment operator
+  __host__ __device__ f128x2 &operator+=(const f128x2 &other) {
+    re = f128::add(re, other.re);
+    im = f128::add(im, other.im);
+    return *this;
+  }
+
+  // Subtraction-assignment operator
+  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
+    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
+    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
+    return *this;
+  }
+
+  // Multiplication-assignment operator
+  __host__ __device__ f128x2 &operator*=(const f128x2 &other) {
+    // Both new parts are computed from the old re / im before either member
+    // is overwritten.
+    const f128 im_x_im = f128::mul(im, other.im);
+    f128 new_re =
+        f128::add(f128::mul(re, other.re), f128(-im_x_im.hi, -im_x_im.lo));
+    f128 new_im = f128::add(f128::mul(re, other.im), f128::mul(im, other.re));
+    re = new_re;
+    im = new_im;
+    return *this;
+  }
+};
+
+// Returns the 64-bit object representation of `d`. The bytes are copied
+// with memcpy because reading a double through a uint64_t pointer violates
+// the strict-aliasing rule.
+__host__ __device__ inline uint64_t double_to_bits(double d) {
+  uint64_t bits;
+  memcpy(&bits, &d, sizeof(bits));
+  return bits;
+}
+
+// Returns the double whose 64-bit object representation is `bits`. The
+// bytes are copied with memcpy because reading a uint64_t through a double
+// pointer violates the strict-aliasing rule.
+__host__ __device__ inline double bits_to_double(uint64_t bits) {
+  double d;
+  memcpy(&d, &bits, sizeof(d));
+  return d;
+}
+
+// Converts an unsigned 128-bit integer to double. The value is split into a
+// low and a high bit field; each field is OR-ed into the mantissa of a
+// power-of-two constant, the constant is subtracted again, and the two
+// partial results are summed.
+// Declared inline because this header defines the function body.
+__host__ __device__ inline double u128_to_f64(__uint128_t x) {
+  const __uint128_t ONE = 1;
+  const double A = ONE << 52;
+  const double B = ONE << 104;
+  const double C = ONE << 76;
+  // NOTE(review): this literal is expected to round to 2^128 — confirm.
+  const double D = 340282366920938500000000000000000000000.;
+
+  const __uint128_t threshold = (ONE << 104);
+
+  if (x < threshold) {
+    // x < 2^104: low 52 bits go with A, the remaining bits with B.
+    uint64_t A_bits = double_to_bits(A);
+
+    __uint128_t shifted = (x << 12);
+    uint64_t lower64 = static_cast<uint64_t>(shifted);
+    lower64 >>= 12;
+
+    uint64_t bits_l = A_bits | lower64;
+    double l_temp = bits_to_double(bits_l);
+    double l = l_temp - A;
+
+    uint64_t B_bits = double_to_bits(B);
+    uint64_t top64 = static_cast<uint64_t>(x >> 52);
+    uint64_t bits_h = B_bits | top64;
+    double h_temp = bits_to_double(bits_h);
+    double h = h_temp - B;
+
+    return (l + h);
+
+  } else {
+    // x >= 2^104: bits 24..75 go with C, bits 76..127 with D; the low 24
+    // bits of x are OR-ed into the low field as well.
+    uint64_t C_bits = double_to_bits(C);
+
+    __uint128_t shifted = (x >> 12);
+    uint64_t lower64 = static_cast<uint64_t>(shifted);
+    lower64 >>= 12;
+
+    uint64_t x_lo = static_cast<uint64_t>(x);
+    uint64_t mask_part = (x_lo & 0xFFFFFFULL);
+
+    uint64_t bits_l = C_bits | lower64 | mask_part;
+    double l_temp = bits_to_double(bits_l);
+    double l = l_temp - C;
+
+    uint64_t D_bits = double_to_bits(D);
+    uint64_t top64 = static_cast<uint64_t>(x >> 76);
+    uint64_t bits_h = D_bits | top64;
+    double h_temp = bits_to_double(bits_h);
+    double h = h_temp - D;
+
+    return (l + h);
+  }
+}
+
+// Converts a double to an unsigned 128-bit integer by shifting its mantissa
+// (with the implicit leading one restored at bit 127) right by
+// 1150 - biased_exponent.
+// Returns 0 when the bit pattern of `f` is below that of 1.0, and also when
+// the shift amount is 128 or more. The subtraction is unsigned, so a biased
+// exponent above 1150 (including any input with the sign bit set) wraps to
+// a large shift and yields 0 as well.
+// Declared inline because this header defines the function body.
+__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
+  const __uint128_t ONE = 1;
+  const uint64_t f_bits = double_to_bits(f);
+  if (f_bits < 1023ull << 52) {
+    return 0;
+  } else {
+    const __uint128_t m = ONE << 127 | (__uint128_t)f_bits << 75;
+    const uint64_t s = 1150 - (f_bits >> 52);
+    if (s >= 128) {
+      return 0;
+    } else {
+      return m >> s;
+    }
+  }
+}
+
+// Converts a signed 128-bit integer to double: converts the magnitude with
+// u128_to_f64 and then ORs the sign bit of `x` into the result.
+// Declared inline because this header defines the function body.
+__host__ __device__ inline double i128_to_f64(__int128_t const x) {
+  uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
+  // Negate in the unsigned domain: `-x` on the signed value overflows
+  // (undefined behavior) for the minimum __int128_t, whereas unsigned
+  // negation wraps and yields the magnitude 2^127.
+  const __uint128_t as_unsigned = static_cast<__uint128_t>(x);
+  const __uint128_t magnitude =
+      (x < 0) ? static_cast<__uint128_t>(0) - as_unsigned : as_unsigned;
+
+  return bits_to_double(double_to_bits(u128_to_f64(magnitude)) | sign);
+}
+// Reinterprets `x` as a signed 128-bit integer and converts it to an f128:
+// hi is the double conversion of that signed value, lo is the double
+// conversion of what remains after hi is converted back to an integer and
+// subtracted from `x` (with wrap-around arithmetic).
+// Declared inline because this header defines the function body.
+__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
+  const double first_approx = i128_to_f64(x);
+  const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
+  const __uint128_t first_approx_roundtrip =
+      f64_to_u128((first_approx < 0) ? -first_approx : first_approx);
+  // Unsigned negation wraps, giving the two's-complement encoding of the
+  // negative approximation.
+  const __uint128_t first_approx_roundtrip_signed =
+      (sign_bit == (1ull << 63)) ? -first_approx_roundtrip
+                                 : first_approx_roundtrip;
+
+  double correction = i128_to_f64(x - first_approx_roundtrip_signed);
+
+  return f128(first_approx, correction);
+}
+
+#include <algorithm>
+#include <string>
+// Convert __uint128_t to decimal string
+// Returns "0" for zero; otherwise the base-10 digits without leading zeros.
+// Declared inline because this header defines the function body.
+inline std::string to_string_128(__uint128_t value) {
+  if (value == 0)
+    return "0";
+
+  std::string result;
+  // Repeatedly divide by 10 and build the number in reverse
+  while (value > 0) {
+    unsigned digit = static_cast<unsigned>(value % 10);
+    result.push_back(static_cast<char>('0' + digit));
+    value /= 10;
+  }
+
+  // The digits are in reverse order, so reverse them
+  std::reverse(result.begin(), result.end());
+  return result;
+}
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cu
@@ -0,0 +1,163 @@
+#include "fft128.cuh"
+
+// Dispatches the forward f128 Fourier transform (input read as integers) to
+// the host_fourier_transform_forward_as_integer_f128 instantiation matching
+// the polynomial size N.
+//   stream, gpu_index - CUDA stream (cast to cudaStream_t) and device index
+//   re0, re1, im0, im1 - output buffers, passed on as double*
+//   standard          - input buffer, passed on as const __uint128_t*
+//   N                 - polynomial size; must be a power of two in [64..4096]
+//   number_of_samples - forwarded unchanged to the host function
+// Panics for any other N.
+void cuda_fourier_transform_forward_as_integer_f128_async(
+    void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
+    void *im1, void const *standard, const uint32_t N,
+    const uint32_t number_of_samples) {
+  switch (N) {
+  case 64:
+    host_fourier_transform_forward_as_integer_f128<Degree<64>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 128:
+    host_fourier_transform_forward_as_integer_f128<Degree<128>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 256:
+    host_fourier_transform_forward_as_integer_f128<Degree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 512:
+    host_fourier_transform_forward_as_integer_f128<Degree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 1024:
+    host_fourier_transform_forward_as_integer_f128<Degree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 2048:
+    host_fourier_transform_forward_as_integer_f128<Degree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 4096:
+    host_fourier_transform_forward_as_integer_f128<Degree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  default:
+    // The message lists the range actually handled by the cases above.
+    PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
+          "N's are powers of two"
+          " in the interval [64..4096].")
+  }
+}
+
+// Dispatches the forward f128 Fourier transform (input read as torus
+// elements) to the host_fourier_transform_forward_as_torus_f128
+// instantiation matching the polynomial size N.
+//   stream, gpu_index - CUDA stream (cast to cudaStream_t) and device index
+//   re0, re1, im0, im1 - output buffers, passed on as double*
+//   standard          - input buffer, passed on as const __uint128_t*
+//   N                 - polynomial size; must be a power of two in [64..4096]
+//   number_of_samples - forwarded unchanged to the host function
+// Panics for any other N.
+void cuda_fourier_transform_forward_as_torus_f128_async(
+    void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
+    void *im1, void const *standard, const uint32_t N,
+    const uint32_t number_of_samples) {
+  switch (N) {
+  case 64:
+    host_fourier_transform_forward_as_torus_f128<Degree<64>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 128:
+    host_fourier_transform_forward_as_torus_f128<Degree<128>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 256:
+    host_fourier_transform_forward_as_torus_f128<Degree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 512:
+    host_fourier_transform_forward_as_torus_f128<Degree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 1024:
+    host_fourier_transform_forward_as_torus_f128<Degree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 2048:
+    host_fourier_transform_forward_as_torus_f128<Degree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  case 4096:
+    host_fourier_transform_forward_as_torus_f128<Degree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
+        (double *)re1, (double *)im0, (double *)im1,
+        (__uint128_t const *)standard, N, number_of_samples);
+    break;
+  default:
+    // The message lists the range actually handled by the cases above.
+    PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
+          "N's are powers of two"
+          " in the interval [64..4096].")
+  }
+}
+
+// Dispatches the backward f128 Fourier transform (output written as torus
+// elements) to the host_fourier_transform_backward_as_torus_f128
+// instantiation matching the polynomial size N.
+//   stream, gpu_index - CUDA stream (cast to cudaStream_t) and device index
+//   standard          - output buffer, passed on as __uint128_t*
+//   re0, re1, im0, im1 - input buffers, passed on as const double*
+//   N                 - polynomial size; must be a power of two in [64..4096]
+//   number_of_samples - forwarded unchanged to the host function
+// Panics for any other N.
+void cuda_fourier_transform_backward_as_torus_f128_async(
+    void *stream, uint32_t gpu_index, void *standard, void const *re0,
+    void const *re1, void const *im0, void const *im1, const uint32_t N,
+    const uint32_t number_of_samples) {
+  switch (N) {
+  case 64:
+    host_fourier_transform_backward_as_torus_f128<Degree<64>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  case 128:
+    host_fourier_transform_backward_as_torus_f128<Degree<128>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  case 256:
+    host_fourier_transform_backward_as_torus_f128<Degree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  case 512:
+    host_fourier_transform_backward_as_torus_f128<Degree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  case 1024:
+    host_fourier_transform_backward_as_torus_f128<Degree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  case 2048:
+    host_fourier_transform_backward_as_torus_f128<Degree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  case 4096:
+    host_fourier_transform_backward_as_torus_f128<Degree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
+        (double const *)re0, (double const *)re1, (double const *)im0,
+        (double const *)im1, N, number_of_samples);
+    break;
+  default:
+    // The message lists the range actually handled by the cases above.
+    PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
+          "N's are powers of two"
+          " in the interval [64..4096].")
+  }
+}
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -0,0 +1,760 @@
+#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
+#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
+
+#include "f128.cuh"
+#include "pbs/fft.h"
+#include "polynomial/functions.cuh"
+#include "polynomial/parameters.cuh"
+#include "twiddles.cuh"
+#include "types/complex/operations.cuh"
+#include <iostream>
+
+using Index = unsigned;
+
+#define NEG_TWID(i)                                                            \
+  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
+         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
+
+// Loads element `ind` of the four arrays dt_re_hi / dt_re_lo / dt_im_hi /
+// dt_im_lo (which must be in scope at the call site) into the f128x2 lvalue
+// `f128x2_reg`. The register argument is parenthesized so that any
+// expression can be passed. Expands to four statements with no trailing
+// semicolon: use it as a full statement inside a braced block.
+#define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
+  (f128x2_reg).re.hi = dt_re_hi[ind];                                          \
+  (f128x2_reg).re.lo = dt_re_lo[ind];                                          \
+  (f128x2_reg).im.hi = dt_im_hi[ind];                                          \
+  (f128x2_reg).im.lo = dt_im_lo[ind]
+
+// Stores the f128x2 expression `f128x2_reg` into element `ind` of the four
+// dt_* arrays (which must be in scope at the call site). The register
+// argument is parenthesized so that a conditional expression such as
+// `cond ? v[i] : u[i]` selects the whole value before `.re` / `.im` is
+// applied. Expands to four statements with no trailing semicolon.
+#define F128x2_TO_F64x4(f128x2_reg, ind)                                       \
+  dt_re_hi[ind] = (f128x2_reg).re.hi;                                          \
+  dt_re_lo[ind] = (f128x2_reg).re.lo;                                          \
+  dt_im_hi[ind] = (f128x2_reg).im.hi;                                          \
+  dt_im_lo[ind] = (f128x2_reg).im.lo
+
+// zl - left part of butterfly operation
+// zr - right part of butterfly operation
+// re - real part
+// im - imaginary part
+// hi - high bits
+// lo - low bits
+// dt - list
+// cf - single coefficient
+//
+// In-place forward negacyclic FFT on complex double-double data stored as
+// four parallel arrays of doubles (re/im x hi/lo).
+// Each thread keeps BUTTERFLY_DEPTH (u, v) pairs in registers: u is loaded
+// from index tid and v from tid + HALF_DEGREE, with tid advancing by STRIDE.
+// Between levels one value of each pair is exchanged through the dt_*
+// arrays. On exit u is stored at index 2 * tid and v at 2 * tid + 1.
+// The function calls __syncthreads(), so every thread of the block has to
+// execute it.
+template <class params>
+__device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
+                                            double *dt_im_hi,
+                                            double *dt_im_lo) {
+
+  __syncthreads();
+  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
+  constexpr Index LOG2_DEGREE = params::log2_degree;
+  constexpr Index HALF_DEGREE = params::degree >> 1;
+  constexpr Index STRIDE = params::degree / params::opt;
+
+  f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
+
+  Index tid = threadIdx.x;
+
+  // load into registers
+#pragma unroll
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    F64x4_TO_F128x2(u[i], tid);
+    F64x4_TO_F128x2(v[i], tid + HALF_DEGREE);
+    tid += STRIDE;
+  }
+
+  // level 1
+  // every butterfly of this level uses the same twiddle, NEG_TWID(1), so it
+  // is read once; the product itself is a full complex multiplication
+  const f128x2 first_twiddle = NEG_TWID(1);
+
+#pragma unroll
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, first_twiddle.re,
+                               first_twiddle.im);
+
+    // v = u - w must be computed before u is overwritten with u + w
+    f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re, w.im);
+    f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re, w.im);
+  }
+
+  Index twiddle_shift = 1;
+  for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
+    Index lane_mask = 1 << (l - 1);
+    Index thread_mask = (1 << l) - 1;
+    twiddle_shift <<= 1;
+
+    // publish the value this thread gives away (v if u stays, u otherwise)
+    tid = threadIdx.x;
+    __syncthreads();
+#pragma unroll
+    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      tid = tid + STRIDE;
+    }
+    __syncthreads();
+
+    // fetch the partner's value from index tid ^ lane_mask and run the
+    // butterfly with the twiddle of this level
+    tid = threadIdx.x;
+#pragma unroll
+    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      F64x4_TO_F128x2(w, tid ^ lane_mask);
+      u[i] = (u_stays_in_register) ? u[i] : w;
+      v[i] = (u_stays_in_register) ? w : v[i];
+      w = NEG_TWID(tid / lane_mask + twiddle_shift);
+
+      // w = v[i] * w; the helper computes all partial products before it
+      // writes its outputs, so w may be both an input and the output
+      f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
+      f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
+                                 w.im);
+      f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re,
+                                 w.im);
+      tid = tid + STRIDE;
+    }
+  }
+  __syncthreads();
+
+  //   store registers in SM
+  tid = threadIdx.x;
+#pragma unroll
+  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+    F128x2_TO_F64x4(u[i], tid * 2);
+    F128x2_TO_F64x4(v[i], (tid * 2 + 1));
+    tid = tid + STRIDE;
+  }
+  __syncthreads();
+}
+
+// In-place inverse negacyclic FFT on complex double-double data stored as
+// four parallel arrays of doubles (re/im x hi/lo).
+// Each thread keeps BUTTERFLY_DEPTH (u, v) pairs in registers: u is loaded
+// from index 2 * tid and v from 2 * tid + 1, with tid advancing by STRIDE.
+// Between levels one value of each pair is exchanged through the dt_*
+// arrays. On exit u is stored at index tid and v at tid + HALF_DEGREE.
+// No scaling factor is applied inside this function.
+// The function calls __syncthreads(), so every thread of the block has to
+// execute it.
+template <class params>
+__device__ void negacyclic_inverse_fft_f128(double *dt_re_hi, double *dt_re_lo,
+                                            double *dt_im_hi,
+                                            double *dt_im_lo) {
+  __syncthreads();
+  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
+  constexpr Index LOG2_DEGREE = params::log2_degree;
+  constexpr Index DEGREE = params::degree;
+  constexpr Index HALF_DEGREE = params::degree >> 1;
+  constexpr Index STRIDE = params::degree / params::opt;
+
+  size_t tid = threadIdx.x;
+  f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
+
+  // load into registers
+#pragma unroll
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+
+    F64x4_TO_F128x2(u[i], 2 * tid);
+    F64x4_TO_F128x2(v[i], 2 * tid + 1);
+
+    tid += STRIDE;
+  }
+
+  Index twiddle_shift = DEGREE;
+  for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
+    Index lane_mask = 1 << (l - 1);
+    Index thread_mask = (1 << l) - 1;
+    twiddle_shift >>= 1;
+
+    // at this point registers are ready for the  butterfly
+    tid = threadIdx.x;
+    __syncthreads();
+#pragma unroll
+    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+      w = (u[i] - v[i]);
+      u[i] += v[i];
+      v[i] = w * NEG_TWID(tid / lane_mask + twiddle_shift).conjugate();
+
+      // keep one of the register for next iteration and store another one in sm
+      // (the conditional is wrapped in parentheses so that the macro applies
+      // .re / .im to the selected value, not to u[i] alone)
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+
+      tid = tid + STRIDE;
+    }
+    __syncthreads();
+
+    // prepare registers for next butterfly iteration
+    tid = threadIdx.x;
+#pragma unroll
+    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      F64x4_TO_F128x2(w, tid ^ lane_mask);
+
+      u[i] = (u_stays_in_register) ? u[i] : w;
+      v[i] = (u_stays_in_register) ? w : v[i];
+
+      tid = tid + STRIDE;
+    }
+  }
+
+  // last iteration
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    w = (u[i] - v[i]);
+    u[i] = u[i] + v[i];
+    v[i] = w * NEG_TWID(1).conjugate();
+  }
+  __syncthreads();
+  // store registers in SM
+  tid = threadIdx.x;
+#pragma unroll
+  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+    F128x2_TO_F64x4(u[i], tid);
+    F128x2_TO_F64x4(v[i], tid + HALF_DEGREE);
+
+    tid = tid + STRIDE;
+  }
+  __syncthreads();
+}
+
+// params is expected to be full degree not half degree
+//
+// Converts 128-bit inputs to double-double outputs: each thread handles
+// params::opt / 2 indices, starting at threadIdx.x and advancing by
+// params::degree / params::opt. For every index the real and imaginary
+// inputs go through u128_to_signed_to_f128 and the hi / lo components are
+// written to the four output arrays at the same index.
+template <class params>
+__device__ void convert_u128_to_f128_as_integer(
+    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
+    const __uint128_t *in_re, const __uint128_t *in_im) {
+
+  constexpr Index ITERATIONS = params::opt / 2;
+  constexpr Index STEP = params::degree / params::opt;
+
+  Index idx = threadIdx.x;
+  for (Index it = 0; it < ITERATIONS; ++it) {
+    __syncthreads();
+    const f128 re = u128_to_signed_to_f128(in_re[idx]);
+    __syncthreads();
+    const f128 im = u128_to_signed_to_f128(in_im[idx]);
+    __syncthreads();
+
+    out_re_hi[idx] = re.hi;
+    out_re_lo[idx] = re.lo;
+    out_im_hi[idx] = im.hi;
+    out_im_lo[idx] = im.lo;
+
+    idx += STEP;
+  }
+}
+
+// params is expected to be full degree not half degree
+//
+// Converts 128-bit inputs to double-double outputs scaled by 2^-128: each
+// thread handles params::opt / 2 indices, starting at threadIdx.x and
+// advancing by params::degree / params::opt. For every index the real and
+// imaginary inputs go through u128_to_signed_to_f128 and each hi / lo
+// component is multiplied by the scale before being written to the four
+// output arrays at the same index.
+template <class params>
+__device__ void convert_u128_to_f128_as_torus(
+    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
+    const __uint128_t *in_re, const __uint128_t *in_im) {
+
+  constexpr Index ITERATIONS = params::opt / 2;
+  constexpr Index STEP = params::degree / params::opt;
+  const double scale = pow(2., -128.);
+
+  Index idx = threadIdx.x;
+  for (Index it = 0; it < ITERATIONS; ++it) {
+    __syncthreads();
+    const f128 re = u128_to_signed_to_f128(in_re[idx]);
+    __syncthreads();
+    const f128 im = u128_to_signed_to_f128(in_im[idx]);
+    __syncthreads();
+
+    out_re_hi[idx] = re.hi * scale;
+    out_re_lo[idx] = re.lo * scale;
+    out_im_hi[idx] = im.hi * scale;
+    out_im_lo[idx] = im.lo * scale;
+
+    idx += STEP;
+  }
+}
+
+// params is expected to be full degree not half degree
+//
+// Kernel wrapper around convert_u128_to_f128_as_integer: block b reads
+// params::degree inputs starting at in[b * params::degree] (first half as
+// real parts, second half as imaginary parts) and writes to the output
+// arrays starting at offset b * params::degree / 2.
+template <class params>
+__global__ void
+batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
+                                      double *out_im_hi, double *out_im_lo,
+                                      const __uint128_t *in) {
+
+  const auto out_offset = blockIdx.x * params::degree / 2;
+  const auto in_offset = blockIdx.x * params::degree;
+
+  convert_u128_to_f128_as_integer<params>(
+      out_re_hi + out_offset, out_re_lo + out_offset, out_im_hi + out_offset,
+      out_im_lo + out_offset, in + in_offset,
+      in + (in_offset + params::degree / 2));
+}
+
+// params is expected to be full degree not half degree
+//
+// Kernel wrapper around convert_u128_to_f128_as_torus: block b reads
+// params::degree inputs starting at in[b * params::degree] (first half as
+// real parts, second half as imaginary parts) and writes to the output
+// arrays starting at offset b * params::degree / 2.
+template <class params>
+__global__ void
+batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
+                                    double *out_im_hi, double *out_im_lo,
+                                    const __uint128_t *in) {
+
+  const auto out_offset = blockIdx.x * params::degree / 2;
+  const auto in_offset = blockIdx.x * params::degree;
+
+  convert_u128_to_f128_as_torus<params>(
+      out_re_hi + out_offset, out_re_lo + out_offset, out_im_hi + out_offset,
+      out_im_lo + out_offset, in + in_offset,
+      in + (in_offset + params::degree / 2));
+}
+
+// Batched negacyclic forward FFT on complex values stored as four arrays of
+// doubles (real/imaginary part, each split into a hi and a lo double).
+//
+// Each block processes the sample selected by blockIdx.x:
+//  1. loads params::degree / 2 entries of every input array into its working
+//     arrays,
+//  2. runs negacyclic_forward_fft_f128<HalfDegree<params>> on them,
+//  3. stores the working arrays to the output arrays at the same offsets.
+//
+// The working arrays live in dynamic shared memory (4 * params::degree / 2
+// doubles) unless SMD == NOSM, in which case a per-block slice of `buffer`
+// (same size, indexed by blockIdx.x) is used instead.
+//
+// Input and output pointers may alias: every thread only reads and writes the
+// indices it owns, and the loads are separated from the stores by barriers.
+template <class params, sharedMemDegree SMD>
+__global__ void
+batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
+                 double *in_im_lo, double *out_re_hi, double *out_re_lo,
+                 double *out_im_hi, double *out_im_lo, double *buffer) {
+  extern __shared__ double sharedMemoryFFT[];
+  double *re_hi, *re_lo, *im_hi, *im_lo;
+
+  if (SMD == NOSM) {
+    re_hi =
+        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
+    re_lo =
+        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
+    im_hi =
+        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
+    im_lo =
+        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
+  } else {
+    re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
+    re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
+    im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
+    im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
+  }
+
+  // Each thread loads params::opt / 2 entries, strided by the block size
+  // (params::degree / params::opt).
+  Index tid = threadIdx.x;
+#pragma unroll
+  for (Index i = 0; i < params::opt / 2; ++i) {
+    re_hi[tid] = in_re_hi[blockIdx.x * (params::degree / 2) + tid];
+    re_lo[tid] = in_re_lo[blockIdx.x * (params::degree / 2) + tid];
+    im_hi[tid] = in_im_hi[blockIdx.x * (params::degree / 2) + tid];
+    im_lo[tid] = in_im_lo[blockIdx.x * (params::degree / 2) + tid];
+    tid += params::degree / params::opt;
+  }
+  // All loads must be visible before the FFT starts mixing entries.
+  __syncthreads();
+  negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
+  __syncthreads();
+
+  // Write the transformed values back with the same access pattern.
+  tid = threadIdx.x;
+#pragma unroll
+  for (Index i = 0; i < params::opt / 2; ++i) {
+    out_re_hi[blockIdx.x * (params::degree / 2) + tid] = re_hi[tid];
+    out_re_lo[blockIdx.x * (params::degree / 2) + tid] = re_lo[tid];
+    out_im_hi[blockIdx.x * (params::degree / 2) + tid] = im_hi[tid];
+    out_im_lo[blockIdx.x * (params::degree / 2) + tid] = im_lo[tid];
+    tid += params::degree / params::opt;
+  }
+}
+
+// Debug helper: prints the 128 bits of `value` to stdout as a string of
+// '0'/'1' characters, most significant bit first, followed by a newline.
+//
+// Declared `inline` because it is defined in a header: without it, including
+// this header from several translation units yields multiple definitions.
+inline void print_uint128_bits(__uint128_t value) {
+  char bits[129]; // 128 binary digits + null terminator
+  bits[128] = '\0';
+
+  // Fill from the end of the string so the least significant bit comes last.
+  for (int i = 127; i >= 0; --i) {
+    bits[i] = (value & 1) ? '1' : '0';
+    value >>= 1;
+  }
+
+  printf("%s\n", bits);
+}
+
+// Host entry point: forward negacyclic FFT of a polynomial with 128-bit
+// coefficients, converted to floating point with
+// batch_convert_u128_to_f128_as_integer.
+//
+// Parameters:
+//  - stream, gpu_index: CUDA stream / device used for the allocations, copies
+//    and kernel launches.
+//  - re0, re1, im0, im1: host output arrays of N / 2 doubles each. They
+//    receive the hi (re0, im0) and lo (re1, im1) doubles of the real and
+//    imaginary parts of the result.
+//  - standard: host input array of N 128-bit coefficients.
+//  - N: polynomial size (presumably equal to params::degree; verify against
+//    callers).
+//  - number_of_samples: number of blocks launched.
+//
+// The function blocks (cudaDeviceSynchronize) until the results are on the
+// host.
+//
+// NOTE(review): the device buffers and the host<->device copies are sized for
+// a single polynomial, while the kernels are launched with number_of_samples
+// blocks that each index their own slice. This looks correct only for
+// number_of_samples == 1 - confirm with callers.
+template <class params>
+__host__ void host_fourier_transform_forward_as_integer_f128(
+    cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
+    double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
+    const uint32_t number_of_samples) {
+
+  // allocate device buffers
+  double *d_re0 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_re1 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_im0 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_im1 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  __uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
+      N * sizeof(__uint128_t), stream, gpu_index);
+
+  // copy input into device
+  cuda_memcpy_async_to_gpu(d_standard, standard, N * sizeof(__uint128_t),
+                           stream, gpu_index);
+
+  // setup launch parameters
+  size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
+  int grid_size = number_of_samples;
+  int block_size = params::degree / params::opt;
+  bool full_sm =
+      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
+  // Global-memory working area used by the NOSM kernel when the working set
+  // does not fit in shared memory: 4 arrays of N / 2 doubles per sample.
+  // The size is expressed in bytes, hence the sizeof(double) factor.
+  size_t buffer_size =
+      full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4 * sizeof(double);
+  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
+  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
+
+  // configure shared memory for batch fft kernel
+  if (full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
+
+  // convert u128 into 4 x double
+  batch_convert_u128_to_f128_as_integer<params>
+      <<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
+                                             d_standard);
+  check_cuda_error(cudaGetLastError());
+
+  // call negacyclic 128 bit forward fft (in place on the device buffers).
+  if (full_sm) {
+    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
+        <<<grid_size, block_size, shared_memory_size, stream>>>(
+            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
+  } else {
+    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
+        <<<grid_size, block_size, shared_memory_size, stream>>>(
+            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
+  }
+  check_cuda_error(cudaGetLastError());
+
+  cudaDeviceSynchronize();
+
+  // copy the result back to the host
+  cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_cpu(re1, d_re1, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_cpu(im0, d_im0, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_cpu(im1, d_im1, N / 2 * sizeof(double), stream,
+                           gpu_index);
+
+  // release every device allocation made above, including the fallback
+  // working area (skipped when nothing was allocated for it).
+  cuda_drop_async(d_standard, stream, gpu_index);
+  cuda_drop_async(d_re0, stream, gpu_index);
+  cuda_drop_async(d_re1, stream, gpu_index);
+  cuda_drop_async(d_im0, stream, gpu_index);
+  cuda_drop_async(d_im1, stream, gpu_index);
+  if (buffer != nullptr)
+    cuda_drop_async(buffer, stream, gpu_index);
+
+  // make sure the asynchronous copies have landed before returning
+  cudaDeviceSynchronize();
+}
+
+// Debug kernel: prints the first N / 2 entries of the four neg_twiddles_*
+// tables (re_hi, re_lo, im_hi, im_lo), one entry per output line.
+// Every launched thread runs the whole loop, so a <<<1, 1>>> launch prints
+// each entry exactly once.
+__global__ void print_twiddles(int N) {
+  const int count = N / 2;
+  int idx = 0;
+  while (idx < count) {
+    printf("%.73f %.73f %.73f %.73f\n", neg_twiddles_re_hi[idx],
+           neg_twiddles_re_lo[idx], neg_twiddles_im_hi[idx],
+           neg_twiddles_im_lo[idx]);
+    ++idx;
+  }
+}
+
+// Debug kernel: prints the first N / 2 entries of the four given arrays
+// (re0, re1, im0, im1), one entry per output line.
+// Every launched thread runs the whole loop, so a <<<1, 1>>> launch prints
+// each entry exactly once.
+__global__ void print_c128(double *re0, double *re1, double *im0, double *im1,
+                           int N) {
+  const int count = N / 2;
+  int idx = 0;
+  while (idx < count) {
+    printf("%.73f %.73f %.73f %.73f\n", re0[idx], re1[idx], im0[idx],
+           im1[idx]);
+    ++idx;
+  }
+}
+
+// Host entry point: forward negacyclic FFT of a polynomial with 128-bit
+// coefficients, converted to floating point with
+// batch_convert_u128_to_f128_as_torus.
+//
+// Parameters:
+//  - stream, gpu_index: CUDA stream / device used for the allocations, copies
+//    and kernel launches.
+//  - re0, re1, im0, im1: host output arrays of N / 2 doubles each. They
+//    receive the hi (re0, im0) and lo (re1, im1) doubles of the real and
+//    imaginary parts of the result.
+//  - standard: host input array of N 128-bit coefficients.
+//  - N: polynomial size (presumably equal to params::degree; verify against
+//    callers).
+//  - number_of_samples: number of blocks launched.
+//
+// The function blocks (cudaDeviceSynchronize) until the results are on the
+// host.
+//
+// NOTE(review): the device buffers and the host<->device copies are sized for
+// a single polynomial, while the kernels are launched with number_of_samples
+// blocks that each index their own slice. This looks correct only for
+// number_of_samples == 1 - confirm with callers.
+template <class params>
+__host__ void host_fourier_transform_forward_as_torus_f128(
+    cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
+    double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
+    const uint32_t number_of_samples) {
+
+  // allocate device buffers
+  double *d_re0 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_re1 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_im0 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_im1 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  __uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
+      N * sizeof(__uint128_t), stream, gpu_index);
+
+  // copy input into device
+  cuda_memcpy_async_to_gpu(d_standard, standard, N * sizeof(__uint128_t),
+                           stream, gpu_index);
+
+  // setup launch parameters
+  size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
+  int grid_size = number_of_samples;
+  int block_size = params::degree / params::opt;
+  bool full_sm =
+      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
+  // Global-memory working area used by the NOSM kernel when the working set
+  // does not fit in shared memory: 4 arrays of N / 2 doubles per sample.
+  // The size is expressed in bytes, hence the sizeof(double) factor.
+  size_t buffer_size =
+      full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4 * sizeof(double);
+  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
+  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
+
+  // configure shared memory for batch fft kernel
+  if (full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
+
+  // convert u128 into 4 x double
+  batch_convert_u128_to_f128_as_torus<params>
+      <<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
+                                             d_standard);
+  check_cuda_error(cudaGetLastError());
+
+  // call negacyclic 128 bit forward fft (in place on the device buffers).
+  if (full_sm) {
+    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
+        <<<grid_size, block_size, shared_memory_size, stream>>>(
+            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
+  } else {
+    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
+        <<<grid_size, block_size, shared_memory_size, stream>>>(
+            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
+  }
+  check_cuda_error(cudaGetLastError());
+
+  cudaDeviceSynchronize();
+
+  // copy the result back to the host
+  cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_cpu(re1, d_re1, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_cpu(im0, d_im0, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_cpu(im1, d_im1, N / 2 * sizeof(double), stream,
+                           gpu_index);
+
+  // release every device allocation made above, including the fallback
+  // working area (skipped when nothing was allocated for it).
+  cuda_drop_async(d_standard, stream, gpu_index);
+  cuda_drop_async(d_re0, stream, gpu_index);
+  cuda_drop_async(d_re1, stream, gpu_index);
+  cuda_drop_async(d_im0, stream, gpu_index);
+  cuda_drop_async(d_im1, stream, gpu_index);
+  if (buffer != nullptr)
+    cuda_drop_async(buffer, stream, gpu_index);
+
+  // make sure the asynchronous copies have landed before returning
+  cudaDeviceSynchronize();
+}
+
+// Host entry point for the backward (inverse) negacyclic FFT on 128-bit torus
+// polynomials.
+//
+// Parameters:
+//  - stream, gpu_index: CUDA stream / device used for the allocations and
+//    copies.
+//  - standard: host output array of N 128-bit coefficients.
+//  - re0, re1, im0, im1: host input arrays of N / 2 doubles each (hi/lo
+//    doubles of the real and imaginary parts, by analogy with the forward
+//    entry points).
+//  - N: polynomial size (presumably equal to params::degree; verify against
+//    callers).
+//  - number_of_samples: number of samples, used to size the launch
+//    parameters.
+//
+// NOTE(review): the inverse transform and the f128 -> u128 conversion are not
+// launched yet, so `d_standard` is copied back to `standard` without ever
+// being written on the device. The surrounding plumbing (uploads, launch
+// parameters, cleanup) is kept ready for when the kernels are wired in.
+template <class params>
+__host__ void host_fourier_transform_backward_as_torus_f128(
+    cudaStream_t stream, uint32_t gpu_index, __uint128_t *standard,
+    double const *re0, double const *re1, double const *im0, double const *im1,
+    const uint32_t N, const uint32_t number_of_samples) {
+
+  // allocate device buffers
+  double *d_re0 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_re1 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_im0 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  double *d_im1 =
+      (double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
+  __uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
+      N * sizeof(__uint128_t), stream, gpu_index);
+
+  // copy input into device: each device array is filled from its matching
+  // host array (not from the output polynomial `standard`).
+  cuda_memcpy_async_to_gpu(d_re0, re0, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_gpu(d_re1, re1, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_gpu(d_im0, im0, N / 2 * sizeof(double), stream,
+                           gpu_index);
+  cuda_memcpy_async_to_gpu(d_im1, im1, N / 2 * sizeof(double), stream,
+                           gpu_index);
+
+  // setup launch parameters (grid_size / block_size are only needed by the
+  // kernel launches that are still to be added, see the TODO below)
+  size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
+  int grid_size = number_of_samples;
+  int block_size = params::degree / params::opt;
+  bool full_sm =
+      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
+  // Global-memory working area for the no-shared-memory path: 4 arrays of
+  // N / 2 doubles per sample. The size is expressed in bytes, hence the
+  // sizeof(double) factor.
+  size_t buffer_size =
+      full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4 * sizeof(double);
+  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
+  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
+
+  // configure shared memory for batch fft kernel
+  if (full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
+
+  // TODO: launch the batched inverse negacyclic FFT on
+  // (d_re0, d_re1, d_im0, d_im1) with <<<grid_size, block_size,
+  // shared_memory_size, stream>>> (FULLSM or NOSM depending on full_sm), then
+  // convert the 4 x double result into d_standard.
+
+  cuda_memcpy_async_to_cpu(standard, d_standard, N * sizeof(__uint128_t),
+                           stream, gpu_index);
+
+  // release every device allocation made above, including the fallback
+  // working area (skipped when nothing was allocated for it).
+  cuda_drop_async(d_standard, stream, gpu_index);
+  cuda_drop_async(d_re0, stream, gpu_index);
+  cuda_drop_async(d_re1, stream, gpu_index);
+  cuda_drop_async(d_im0, stream, gpu_index);
+  cuda_drop_async(d_im1, stream, gpu_index);
+  if (buffer != nullptr)
+    cuda_drop_async(buffer, stream, gpu_index);
+
+  // make sure the asynchronous copy has landed before returning
+  cudaDeviceSynchronize();
+}
+
+#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cuh
@@ -0,0 +1,11 @@
+#ifndef GPU_BOOTSTRAP_128_TWIDDLES_CUH
+#define GPU_BOOTSTRAP_128_TWIDDLES_CUH
+
+/*
+ * 'negtwiddles' are stored in device memory to profit from caching
+ */
+extern __device__ double neg_twiddles_re_hi[4096];
+extern __device__ double neg_twiddles_re_lo[4096];
+extern __device__ double neg_twiddles_im_hi[4096];
+extern __device__ double neg_twiddles_im_lo[4096];
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -0,0 +1,43 @@
+#include "integer/abs.cuh"
+
+// C entry point preparing the scratch memory of the in-place absolute value
+// on 64-bit radix ciphertexts.
+//
+// The cryptographic parameters are bundled into an int_radix_params and
+// forwarded, together with the streams and the untyped `mem_ptr`, to
+// scratch_cuda_integer_abs_kb<uint64_t>, which stores a newly allocated
+// int_abs_buffer<uint64_t> in `*mem_ptr` when `is_signed` is true.
+void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  int_radix_params radix_params(
+      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
+      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
+      grouping_factor, message_modulus, carry_modulus);
+
+  // Recover the typed views of the opaque pointers received through the C API.
+  auto typed_streams = (cudaStream_t *)(streams);
+  auto typed_mem_ptr = (int_abs_buffer<uint64_t> **)mem_ptr;
+
+  scratch_cuda_integer_abs_kb<uint64_t>(typed_streams, gpu_indexes, gpu_count,
+                                        typed_mem_ptr, is_signed, num_blocks,
+                                        radix_params, allocate_gpu_memory);
+}
+
+// C entry point of the in-place absolute value on 64-bit radix ciphertexts.
+//
+// Casts the opaque pointers of the C API to their typed counterparts and
+// forwards everything to host_integer_abs_kb<uint64_t>, which updates `ct` in
+// place (and returns immediately when `is_signed` is false).
+void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {
+
+  auto typed_streams = (cudaStream_t *)(streams);
+  auto typed_ct = static_cast<uint64_t *>(ct);
+  auto typed_ksks = (uint64_t **)(ksks);
+  auto abs_buffer = (int_abs_buffer<uint64_t> *)mem_ptr;
+
+  host_integer_abs_kb<uint64_t>(typed_streams, gpu_indexes, gpu_count,
+                                typed_ct, bsks, typed_ksks, abs_buffer,
+                                is_signed, num_blocks);
+}
+
+// C entry point releasing the scratch memory of the in-place absolute value.
+//
+// `mem_ptr_void` points to the opaque handle filled by
+// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64. That scratch
+// function only allocates the buffer for signed inputs, so the handle may
+// still be null here (assuming the caller initialised it to null before the
+// scratch call - verify against callers); in that case there is nothing to
+// release.
+void cleanup_cuda_integer_abs_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count,
+                                      int8_t **mem_ptr_void) {
+  int_abs_buffer<uint64_t> *mem_ptr =
+      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
+  if (mem_ptr == nullptr)
+    return;
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -0,0 +1,69 @@
+#ifndef TFHE_RS_ABS_CUH
+#define TFHE_RS_ABS_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer/bitwise_ops.cuh"
+#include "integer/comparison.cuh"
+#include "integer/integer.cuh"
+#include "integer/integer_utilities.h"
+#include "integer/negation.cuh"
+#include "integer/scalar_shifts.cuh"
+#include "linear_algebra.h"
+#include "pbs/programmable_bootstrap.h"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Allocates the scratch buffer used by host_integer_abs_kb.
+//
+// For signed inputs a new int_abs_buffer<Torus> is created from the given
+// streams, parameters and block count, and stored in `*mem_ptr`.
+// For unsigned inputs nothing is allocated and `*mem_ptr` is left untouched.
+template <typename Torus>
+__host__ void scratch_cuda_integer_abs_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
+    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+
+  // The absolute value of an unsigned integer needs no scratch memory.
+  if (!is_signed)
+    return;
+
+  auto abs_buffer = new int_abs_buffer<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
+  *mem_ptr = abs_buffer;
+}
+
+// Replaces the radix ciphertext `ct` (num_blocks blocks) by its absolute
+// value, in place. Does nothing when `is_signed` is false.
+//
+// Steps for signed inputs, using the scratch memory in `mem_ptr`:
+//  1. copy `ct` into `mem_ptr->mask`,
+//  2. apply the arithmetic scalar shift by (number of bits - 1) to the mask,
+//  3. ct = ct + mask, followed by a single carry propagation,
+//  4. ct = bitop(ct, mask), with the bitwise operation configured by
+//     `mem_ptr->bitxor_mem` (an XOR, judging by the name).
+//
+// The number of bits is floor(log2(message_modulus)) * num_blocks.
+template <typename Torus>
+__host__ void
+host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                    uint32_t gpu_count, Torus *ct, void *const *bsks,
+                    uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
+                    bool is_signed, uint32_t num_blocks) {
+  if (is_signed) {
+    auto params = mem_ptr->params;
+    auto sign_mask = mem_ptr->mask;
+
+    // Size in bytes of one block: big_lwe_dimension + 1 torus elements.
+    auto block_bytes = (params.big_lwe_dimension + 1) * sizeof(Torus);
+
+    // 31 - clz(x) is floor(log2(x)) for a non-zero 32-bit value.
+    uint32_t bits_per_block = 31 - __builtin_clz(params.message_modulus);
+    uint32_t num_bits_in_ciphertext = bits_per_block * num_blocks;
+
+    // mask <- ct
+    cuda_memcpy_async_gpu_to_gpu(sign_mask, ct, num_blocks * block_bytes,
+                                 streams[0], gpu_indexes[0]);
+
+    // mask <- arithmetic shift of mask by (number of bits - 1)
+    host_integer_radix_arithmetic_scalar_shift_kb_inplace(
+        streams, gpu_indexes, gpu_count, sign_mask, num_bits_in_ciphertext - 1,
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
+
+    // ct <- ct + mask, then clean the carries
+    host_addition<Torus>(streams[0], gpu_indexes[0], ct, sign_mask, ct,
+                         params.big_lwe_dimension, num_blocks);
+    host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
+                                       nullptr, nullptr, mem_ptr->scp_mem, bsks,
+                                       ksks, num_blocks);
+
+    // ct <- bitop(ct, mask)
+    host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, sign_mask,
+                                ct, mem_ptr->bitxor_mem, bsks, ksks,
+                                num_blocks);
+  }
+}
+
+#endif // TFHE_RS_ABS_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -127,6 +127,16 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(

  // phase 3
  auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
+  if (op == SIGNED_OPERATION::SUBTRACTION && num_blocks == 1) {
+    // Quick fix for the case where the subtraction is done on a single block
+    Torus *one_scalar =
+        (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
+    cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], one_scalar, 1, 1);
+    create_trivial_radix<Torus>(
+        streams[0], gpu_indexes[0], input_carry, one_scalar, big_lwe_dimension,
+        1, 1, radix_params.message_modulus, radix_params.carry_modulus);
+    cuda_drop_async(one_scalar, streams[0], gpu_indexes[0]);
+  }

  host_resolve_signed_overflow<Torus>(
      streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -14,27 +14,14 @@ __host__ void zero_out_if(cudaStream_t const *streams,
  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

-  int big_lwe_size = params.big_lwe_dimension + 1;
-
-  // Left message is shifted
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = (params.big_lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-
  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
-  // second operand is fixed
+  // second operand is not an array
  auto tmp_lwe_array_input = mem_ptr->tmp;
-  for (int i = 0; i < num_radix_blocks; i++) {
-    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
-    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
-
-    device_pack_bivariate_blocks<Torus>
-        <<<num_blocks, num_threads, 0, streams[0]>>>(
-            lwe_array_out_block, predicate->lwe_indexes_in,
-            lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
-            params.big_lwe_dimension, params.message_modulus, 1);
-    check_cuda_error(cudaGetLastError());
-  }
+  pack_bivariate_blocks_with_single_block<Torus>(
+      streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
+      predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
+      predicate->lwe_indexes_in, params.big_lwe_dimension,
+      params.message_modulus, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
@@ -56,10 +43,7 @@ __host__ void host_integer_radix_cmux_kb(
  auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
  auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
  for (uint j = 0; j < gpu_count; j++) {
-    // cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-    cudaEventRecord(mem_ptr->ingoing_events[j], streams[j]);
-    cudaStreamWaitEvent(true_streams[j], mem_ptr->ingoing_events[j], 0);
-    cudaStreamWaitEvent(false_streams[j], mem_ptr->ingoing_events[j], 0);
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

  auto mem_true = mem_ptr->zero_if_true_buffer;
@@ -67,29 +51,16 @@ __host__ void host_integer_radix_cmux_kb(
                     lwe_array_true, lwe_condition, mem_true,
                     mem_ptr->inverted_predicate_lut, bsks, ksks,
                     num_radix_blocks);
-  for (uint j = 0; j < gpu_count; j++) {
-    cudaEventRecord(mem_ptr->outgoing_events1[j], true_streams[j]);
-  }
-
  auto mem_false = mem_ptr->zero_if_false_buffer;
  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
                     num_radix_blocks);
-  for (uint j = 0; j < gpu_count; j++) {
-    cudaEventRecord(mem_ptr->outgoing_events2[j], false_streams[j]);
+  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
+    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
  }
-  // for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
-  //   cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
-  // }
-  // for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++)
-  // {
-  //   cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
-  // }
-
-  for (uint j = 0; j < gpu_count; j++) {
-    cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events1[j], 0);
-    cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events2[j], 0);
+  for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
+    cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
  }

  // If the condition was true, true_ct will have kept its value and false_ct
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -2,11 +2,12 @@

 void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -14,7 +15,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_div_rem_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
      (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }
@@ -22,7 +23,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
 void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *quotient, void *remainder, void const *numerator, void const *divisor,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    uint32_t num_blocks) {

  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
@@ -31,8 +32,8 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
      static_cast<const uint64_t *>(numerator),
-      static_cast<const uint64_t *>(divisor), bsks, (uint64_t **)(ksks), mem,
-      num_blocks);
+      static_cast<const uint64_t *>(divisor), is_signed, bsks,
+      (uint64_t **)(ksks), mem, num_blocks);
 }

 void cleanup_cuda_integer_div_rem(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -3,6 +3,7 @@

 #include "crypto/keyswitch.cuh"
 #include "device.h"
+#include "integer/abs.cuh"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
@@ -161,22 +162,21 @@ template <typename Torus> struct lwe_ciphertext_list {
 template <typename Torus>
 __host__ void scratch_cuda_integer_div_rem_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_div_rem_memory<Torus> **mem_ptr,
+    uint32_t gpu_count, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

-  *mem_ptr = new int_div_rem_memory<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
+  *mem_ptr =
+      new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                    is_signed, num_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
-__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, Torus *quotient,
-                                      Torus *remainder, Torus const *numerator,
-                                      Torus const *divisor, void *const *bsks,
-                                      uint64_t *const *ksks,
-                                      int_div_rem_memory<uint64_t> *mem_ptr,
-                                      uint32_t num_blocks) {
+__host__ void host_unsigned_integer_div_rem_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *quotient, Torus *remainder,
+    Torus const *numerator, Torus const *divisor, void *const *bsks,
+    uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr,
+    uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;

@@ -375,16 +375,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    }; // left_shift_interesting_remainder2

    for (uint j = 0; j < gpu_count; j++) {
-      cudaEventRecord(mem_ptr->ingoing_events1[j], streams[j]);
-      // cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_1[j],
-                          mem_ptr->ingoing_events1[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_2[j],
-                          mem_ptr->ingoing_events1[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_3[j],
-                          mem_ptr->ingoing_events1[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_4[j],
-                          mem_ptr->ingoing_events1[j], 0);
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
    // interesting_divisor
    trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
@@ -398,21 +389,11 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // interesting_remainder2
    left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
                                      gpu_count);
-
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-      cudaEventRecord(mem_ptr->outgoing_events1[j], mem_ptr->sub_streams_1[j]);
-      cudaEventRecord(mem_ptr->outgoing_events2[j], mem_ptr->sub_streams_2[j]);
-      cudaEventRecord(mem_ptr->outgoing_events3[j], mem_ptr->sub_streams_3[j]);
-      cudaEventRecord(mem_ptr->outgoing_events4[j], mem_ptr->sub_streams_4[j]);
-
-      // cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events1[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events2[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events3[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events4[j], 0);
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
    }

    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
@@ -444,16 +425,11 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    auto do_overflowing_sub = [&](cudaStream_t const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count) {
-      uint32_t compute_borrow = 1;
-      uint32_t uses_input_borrow = 0;
-      mem_ptr->overflow_sub_mem->update_lut_indexes(
-          streams, gpu_indexes, merged_interesting_remainder.len);
-      host_integer_overflowing_sub<uint64_t>(
+      host_integer_overflowing_sub_kb<Torus>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
-          (uint64_t *)merged_interesting_remainder.data,
-          interesting_divisor.data, subtraction_overflowed.data,
-          (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
-          merged_interesting_remainder.len, compute_borrow, uses_input_borrow);
+          subtraction_overflowed.data, merged_interesting_remainder.data,
+          interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
+          merged_interesting_remainder.len);
    };

    // fills:
@@ -502,14 +478,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,

    // phase 2
    for (uint j = 0; j < gpu_count; j++) {
-      cudaEventRecord(mem_ptr->ingoing_events2[j], streams[j]);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_1[j],
-                          mem_ptr->ingoing_events2[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_2[j],
-                          mem_ptr->ingoing_events2[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_3[j],
-                          mem_ptr->ingoing_events2[j], 0);
-      // cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
    // new_remainder
    // subtraction_overflowed
@@ -520,15 +489,9 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
                                             gpu_indexes, gpu_count);
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-      cudaEventRecord(mem_ptr->outgoing_events5[j], mem_ptr->sub_streams_1[j]);
-      cudaEventRecord(mem_ptr->outgoing_events6[j], mem_ptr->sub_streams_2[j]);
-      cudaEventRecord(mem_ptr->outgoing_events7[j], mem_ptr->sub_streams_3[j]);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events5[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events6[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events7[j], 0);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
@@ -583,14 +546,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    };

    for (uint j = 0; j < gpu_count; j++) {
-      cudaEventRecord(mem_ptr->ingoing_events3[j], streams[j]);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_1[j],
-                          mem_ptr->ingoing_events3[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_2[j],
-                          mem_ptr->ingoing_events3[j], 0);
-      cudaStreamWaitEvent(mem_ptr->sub_streams_3[j],
-                          mem_ptr->ingoing_events3[j], 0);
-      // cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
    // cleaned_merged_interesting_remainder
    conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
@@ -601,15 +557,9 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
    // quotient
    set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-      cudaEventRecord(mem_ptr->outgoing_events8[j], mem_ptr->sub_streams_1[j]);
-      cudaEventRecord(mem_ptr->outgoing_events9[j], mem_ptr->sub_streams_2[j]);
-      cudaEventRecord(mem_ptr->outgoing_events10[j], mem_ptr->sub_streams_3[j]);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events8[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events9[j], 0);
-      cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events10[j], 0);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
-      // cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

    assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
@@ -644,4 +594,105 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
  }
 }

+template <typename Torus>
+__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count, Torus *quotient,
+                                      Torus *remainder, Torus const *numerator,
+                                      Torus const *divisor, bool is_signed,
+                                      void *const *bsks, uint64_t *const *ksks,
+                                      int_div_rem_memory<uint64_t> *int_mem_ptr,
+                                      uint32_t num_blocks) {
+  // is_signed selects the signed pipeline below; otherwise divide directly
+  if (is_signed) {
+    auto radix_params = int_mem_ptr->params;
+    uint32_t big_lwe_size = radix_params.big_lwe_dimension + 1;
+
+    // Scratch buffers from int_mem_ptr, filled with copies of both operands
+    lwe_ciphertext_list<Torus> positive_numerator(
+        int_mem_ptr->positive_numerator, radix_params, num_blocks);
+    lwe_ciphertext_list<Torus> positive_divisor(int_mem_ptr->positive_divisor,
+                                                radix_params, num_blocks);
+
+    positive_numerator.clone_from((Torus *)numerator, 0, num_blocks - 1,
+                                  streams[0], gpu_indexes[0]);
+    positive_divisor.clone_from((Torus *)divisor, 0, num_blocks - 1, streams[0],
+                                gpu_indexes[0]);
+    // Wait for `streams` (the copies above) before using the sub-streams
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+    // Run host_integer_abs_kb on each copy, one operand per sub-stream set
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
+                               gpu_count, positive_numerator.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_1, true, num_blocks);
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                               gpu_count, positive_divisor.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_2, true, num_blocks);
+    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+    }
+    // Unsigned division of the two processed copies, issued on sub_streams_1
+    host_unsigned_integer_div_rem_kb<Torus>(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, remainder,
+        positive_numerator.data, positive_divisor.data, bsks, ksks,
+        int_mem_ptr->unsigned_mem, num_blocks);
+    // On sub_streams_2: bivariate LUT over the last block of each operand
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
+        int_mem_ptr->sign_bits_are_different,
+        &numerator[big_lwe_size * (num_blocks - 1)],
+        &divisor[big_lwe_size * (num_blocks - 1)], bsks, ksks, 1,
+        int_mem_ptr->compare_signed_bits_lut,
+        int_mem_ptr->compare_signed_bits_lut->params.message_modulus);
+
+    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+    }
+    // Negate quotient and remainder into scratch, then propagate carries
+    host_integer_radix_negation(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
+        int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
+        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
+
+    host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
+                                       gpu_count, int_mem_ptr->negated_quotient,
+                                       nullptr, nullptr, int_mem_ptr->scp_mem_1,
+                                       bsks, ksks, num_blocks);
+
+    host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes,
+                                gpu_count, int_mem_ptr->negated_remainder,
+                                remainder, radix_params.big_lwe_dimension,
+                                num_blocks, radix_params.message_modulus,
+                                radix_params.carry_modulus);
+
+    host_propagate_single_carry<Torus>(
+        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
+        int_mem_ptr->negated_remainder, nullptr, nullptr,
+        int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks);
+    // cmux each result with its negated version; outputs overwrite in place
+    host_integer_radix_cmux_kb<Torus>(
+        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
+        int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
+        quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks);
+
+    host_integer_radix_cmux_kb<Torus>(
+        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
+        &numerator[big_lwe_size * (num_blocks - 1)],
+        int_mem_ptr->negated_remainder, remainder,
+        int_mem_ptr->cmux_remainder_mem, bsks, ksks, num_blocks);
+
+    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+    }
+  } else {
+    host_unsigned_integer_div_rem_kb<Torus>(
+        streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
+        divisor, bsks, ksks, int_mem_ptr->unsigned_mem, num_blocks);
+  }
+}
+
 #endif // TFHE_RS_DIV_REM_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -1,5 +1,4 @@
 #include "integer/integer.cuh"
-#include "integer/negation.cuh"
 #include <linear_algebra.h>

 void cuda_full_propagation_64_inplace(void *const *streams,
@@ -63,46 +62,6 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
      allocate_gpu_memory);
 }

-void scratch_cuda_fast_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
-    uint32_t uses_carry, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_fast_propagate_single_carry_kb_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_fast_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
-      requested_flag, uses_carry, allocate_gpu_memory);
-}
-
-void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
-    bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_overflowing_sub<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_fast_borrow_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
-      compute_overflow, allocate_gpu_memory);
-}
-
 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
@@ -114,37 +73,6 @@ void cuda_propagate_single_carry_kb_64_inplace(
      (uint64_t **)(ksks), num_blocks);
 }

-void cuda_fast_propagate_single_carry_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks,
-    uint32_t requested_flag, uint32_t uses_carry) {
-
-  host_fast_propagate_single_carry<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
-      static_cast<const uint64_t *>(carry_in),
-      (int_fast_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      num_blocks, requested_flag, uses_carry);
-}
-
-void cuda_integer_overflowing_sub_kb_64_inplace(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lhs_array, const void *rhs_array, void *overflow_block,
-    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
-    uint32_t uses_input_borrow) {
-
-  host_integer_overflowing_sub<uint64_t>(
-      (cudaStream_t const *)streams, gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
-      static_cast<const uint64_t *>(rhs_array),
-      static_cast<uint64_t *>(overflow_block),
-      static_cast<const uint64_t *>(input_borrow),
-      (int_fast_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      num_blocks, compute_overflow, uses_input_borrow);
-}
-
 void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
@@ -166,24 +94,6 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void cleanup_cuda_fast_propagate_single_carry(void *const *streams,
-                                              uint32_t const *gpu_indexes,
-                                              uint32_t gpu_count,
-                                              int8_t **mem_ptr_void) {
-  int_fast_sc_prop_memory<uint64_t> *mem_ptr =
-      (int_fast_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
-
-void cleanup_cuda_integer_overflowing_sub(void *const *streams,
-                                          uint32_t const *gpu_indexes,
-                                          uint32_t gpu_count,
-                                          int8_t **mem_ptr_void) {
-  int_fast_borrow_prop_memory<uint64_t> *mem_ptr =
-      (int_fast_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
-
 void scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -8,7 +8,6 @@
 #include "integer/scalar_addition.cuh"
 #include "linear_algebra.h"
 #include "linearalgebra/addition.cuh"
-#include "linearalgebra/negation.cuh"
 #include "pbs/programmable_bootstrap.h"
 #include "polynomial/functions.cuh"
 #include "utils/helper.cuh"
@@ -129,134 +128,6 @@ host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
 }

-// If group_size = 4, the first group of 4 elements will be transformed as
-// follows:
-//  dest[0] = src[0]
-//  dest[1] = src[0] + src[1]
-//  dest[2] = src[0] + src[1] + src[2]
-//  dest[3] = src[0] + src[1] + src[2] + src[3]
-template <typename Torus>
-__global__ void
-radix_cumulative_sum_in_groups(Torus *dest, Torus *src, uint32_t blocks_count,
-                               uint32_t lwe_size, uint32_t group_size) {
-
-  size_t block_offset = blockIdx.x * group_size * lwe_size;
-
-  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
-    size_t idx = j + block_offset;
-    Torus sum = src[idx];
-    dest[idx] = sum;
-    for (int gidx = 1; gidx < group_size; gidx++) {
-      if (gidx + blockIdx.x * group_size <
-          blocks_count) { // in case the last group is not full
-        sum += src[idx + gidx * lwe_size];
-        dest[idx + gidx * lwe_size] = sum;
-      }
-    }
-  }
-}
-
-template <typename Torus>
-__host__ void host_radix_cumulative_sum_in_groups(
-    cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src,
-    uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) {
-  cudaSetDevice(gpu_index);
-  // Each CUDA block is responsible for a single group
-  int num_blocks = (radix_blocks_count + group_size - 1) / group_size,
-      num_threads = 512;
-  radix_cumulative_sum_in_groups<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      dest, src, radix_blocks_count, lwe_size, group_size);
-}
-
-template <typename Torus>
-__global__ void radix_split_simulators_and_grouping_pgns(
-    Torus *simulators, Torus *grouping_pgns, Torus *src, uint32_t blocks_count,
-    uint32_t lwe_size, uint32_t group_size, Torus delta) {
-
-  size_t block_offset = blockIdx.x * lwe_size;
-  if (blockIdx.x % group_size == 0) {
-    if (blockIdx.x == 0) {
-      // save trivial 0
-      for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
-        simulators[j] = 0;
-      }
-    } else {
-      // save trivial 1
-      for (int j = threadIdx.x; j < lwe_size - 1; j += blockDim.x) {
-        size_t simu_idx = j + block_offset;
-        simulators[simu_idx] = 0;
-      }
-      if (threadIdx.x == 0) {
-        simulators[lwe_size - 1 + block_offset] = 1 * delta;
-      }
-    }
-
-    if ((blockIdx.x / group_size + 1) <
-        (blocks_count + group_size - 1) / group_size) {
-      size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
-      size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
-      for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
-        size_t in_offset = j + src_offset;
-        size_t out_offset = j + pgns_offset;
-        grouping_pgns[out_offset] = src[in_offset];
-      }
-    }
-  } else {
-    // save simulators
-    size_t src_offset = (blockIdx.x - 1) * lwe_size;
-    for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
-      simulators[j + block_offset] = src[j + src_offset];
-    }
-  }
-}
-
-template <typename Torus>
-__host__ void host_radix_split_simulators_and_grouping_pgns(
-    cudaStream_t stream, uint32_t gpu_index, Torus *simulators,
-    Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count,
-    uint32_t lwe_size, uint32_t group_size, Torus delta) {
-  cudaSetDevice(gpu_index);
-  // Each CUDA block is responsible for a single group
-  int num_blocks = radix_blocks_count, num_threads = 512;
-  radix_split_simulators_and_grouping_pgns<Torus>
-      <<<num_blocks, num_threads, 0, stream>>>(simulators, grouping_pgns, src,
-                                               radix_blocks_count, lwe_size,
-                                               group_size, delta);
-}
-
-// If group_size = 4, the first group of 4 elements will be transformed as
-// follows:
-//  src1 size num_radix_blocks * lwe_size
-//  src2 size num_group * lwe_size
-//  dest[0] = src1[0] + src2[0]
-//  dest[1] = src1[1] + src2[0]
-//  dest[2] = src1[2] + src2[0]
-//  dest[3] = src1[3] + src2[0]
-template <typename Torus>
-__global__ void radix_sum_in_groups(Torus *dest, Torus *src1, Torus *src2,
-                                    uint32_t blocks_count, uint32_t lwe_size,
-                                    uint32_t group_size) {
-
-  size_t src1_offset = blockIdx.x * lwe_size;
-  size_t src2_index = (blockIdx.x / group_size) * lwe_size;
-  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
-    size_t idx = j + src1_offset;
-    dest[idx] = src1[idx] + src2[j + src2_index];
-  }
-}
-
-template <typename Torus>
-__host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
-                                       Torus *dest, Torus *src1, Torus *src2,
-                                       uint32_t radix_blocks_count,
-                                       uint32_t lwe_size, uint32_t group_size) {
-  cudaSetDevice(gpu_index);
-
-  int num_blocks = radix_blocks_count, num_threads = 512;
-  radix_sum_in_groups<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      dest, src1, src2, radix_blocks_count, lwe_size, group_size);
-}
-
 // polynomial_size threads
 template <typename Torus>
 __global__ void
@@ -271,8 +142,10 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out,
    int block_id = tid / (lwe_dimension + 1);
    int coeff_id = tid % (lwe_dimension + 1);

-    int pos_in = lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
-    int pos_out = lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;
+    const int pos_in =
+        lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
+    const int pos_out =
+        lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;
    lwe_array_out[pos_out] = lwe_array_1[pos_in] * shift + lwe_array_2[pos_in];
  }
 }
@@ -301,6 +174,50 @@ pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
  check_cuda_error(cudaGetLastError());
 }

+// One active thread per coefficient: num_blocks * (lwe_dimension + 1) in all
+template <typename Torus>
+__global__ void device_pack_bivariate_blocks_with_single_block(
+    Torus *lwe_array_out, Torus const *lwe_indexes_out,
+    Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
+    uint32_t lwe_dimension, uint32_t shift, uint32_t num_blocks) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads past the last coefficient do nothing
+  if (tid < num_blocks * (lwe_dimension + 1)) {
+    int block_id = tid / (lwe_dimension + 1);
+    int coeff_id = tid % (lwe_dimension + 1);
+    // lwe_2 holds one block, so it is indexed by coeff_id alone
+    const int pos_in =
+        lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
+    const int pos_out =
+        lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;
+    lwe_array_out[pos_out] = lwe_array_1[pos_in] * shift + lwe_2[coeff_id];
+  }
+}
+
+/* Pack every block m1 of lwe_array_1 with the single block lwe_2:
+ *  out = m1 * shift + lwe_2, with blocks addressed via lwe_indexes_in/out.
+ *
+ *  Variant of pack_bivariate_blocks whose second operand is a single block.
+ */
+template <typename Torus>
+__host__ void pack_bivariate_blocks_with_single_block(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_indexes_out,
+    Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
+    uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) {
+  // Only gpu_indexes[0] / streams[0] are used; gpu_count is not read here
+  cudaSetDevice(gpu_indexes[0]);
+  // Left operand (lwe_array_1) is scaled by shift; one entry per coefficient
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = num_radix_blocks * (lwe_dimension + 1);
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  device_pack_bivariate_blocks_with_single_block<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(
+          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_2, lwe_indexes_in,
+          lwe_dimension, shift, num_radix_blocks);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -608,48 +525,6 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
  rotate_left<Torus>(body, half_box_size, polynomial_size);
 }

-template <typename Torus>
-void generate_many_lookup_table(
-    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t message_modulus, uint32_t carry_modulus,
-    std::vector<std::function<Torus(Torus)>> &functions) {
-
-  uint32_t modulus_sup = message_modulus * carry_modulus;
-  uint32_t box_size = polynomial_size / modulus_sup;
-  Torus delta = (1ul << 63) / modulus_sup;
-
-  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
-
-  auto body = &acc[glwe_dimension * polynomial_size];
-
-  size_t fn_counts = functions.size();
-
-  assert(fn_counts <= modulus_sup / 2);
-
-  // Space used for each sub lut
-  uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size;
-
-  // This accumulator extracts the carry bits
-  for (int f = 0; f < fn_counts; f++) {
-    int lut_offset = f * single_function_sub_lut_size;
-    for (int i = 0; i < modulus_sup / fn_counts; i++) {
-      int index = i * box_size + lut_offset;
-      for (int j = index; j < index + box_size; j++) {
-        auto f_eval = functions[f](i);
-        body[j] = f_eval * delta;
-      }
-    }
-  }
-  int half_box_size = box_size / 2;
-
-  // Negate the first half_box_size coefficients
-  for (int i = 0; i < half_box_size; i++) {
-    body[i] = -body[i];
-  }
-
-  rotate_left<Torus>(body, half_box_size, polynomial_size);
-}
-
 template <typename Torus>
 void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                                     uint32_t polynomial_size,
@@ -783,37 +658,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  free(h_lut);
 }

-/*
- *  generate many lut accumulator for device pointer
- *    v_stream - cuda stream
- *    acc - device pointer for accumulator
- *    ...
- *    vector<f> - evaluating functions with one Torus input
- */
-template <typename Torus>
-void generate_many_lut_device_accumulator(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus,
-    std::vector<std::function<Torus(Torus)>> &functions) {
-
-  // host lut
-  Torus *h_lut =
-      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-  // fill accumulator
-  generate_many_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
-                                    message_modulus, carry_modulus, functions);
-
-  // copy host lut and lut_indexes_vec to device
-  cuda_memcpy_async_to_gpu(
-      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      stream, gpu_index);
-
-  cuda_synchronize_stream(stream, gpu_index);
-  free(h_lut);
-}
-
 template <typename Torus>
 void scratch_cuda_propagate_single_carry_kb_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -826,108 +670,6 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
                                    num_radix_blocks, allocate_gpu_memory);
 }

-template <typename Torus>
-void host_compute_shifted_blocks_and_states(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
-    int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride,
-    uint32_t lut_count) {
-
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto shifted_blocks_and_states = mem->shifted_blocks_and_states;
-  auto luts_array_first_step = mem->luts_array_first_step;
-
-  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, shifted_blocks_and_states, lwe_array,
-      bsks, ksks, num_blocks, luts_array_first_step, lut_count, lut_stride);
-
-  auto shifted_blocks = mem->shifted_blocks;
-  auto block_states = mem->block_states;
-  cuda_memcpy_async_gpu_to_gpu(block_states, shifted_blocks_and_states,
-                               big_lwe_size_bytes * num_blocks, streams[0],
-                               gpu_indexes[0]);
-  cuda_memcpy_async_gpu_to_gpu(
-      shifted_blocks, shifted_blocks_and_states + big_lwe_size * num_blocks,
-      big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]);
-}
-
-template <typename Torus>
-void host_resolve_group_carries_sequentially(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *resolved_carries, Torus *grouping_pgns,
-    int_radix_params params, int_seq_group_prop_memory<Torus> *mem,
-    void *const *bsks, Torus *const *ksks, uint32_t num_groups) {
-
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto group_resolved_carries = mem->group_resolved_carries;
-  if (num_groups > 1) {
-    // First carry is just copied
-    cuda_memcpy_async_gpu_to_gpu(resolved_carries + big_lwe_size, grouping_pgns,
-                                 big_lwe_size_bytes, streams[0],
-                                 gpu_indexes[0]);
-    uint32_t solve_per_iter = mem->grouping_size - 1;
-    uint32_t remaining_carries =
-        num_groups -
-        2; // the first one has been resolved and we ignore the last one
-    uint32_t num_loops =
-        ceil(double(remaining_carries) / (double)(solve_per_iter));
-    uint32_t last_resolved_pos = 1;
-
-    for (int i = 0; i < num_loops; i++) {
-      uint32_t loop_offset = i * solve_per_iter;
-      uint32_t blocks_to_solve = solve_per_iter;
-      // In case the last iteration has to solve less
-      if (loop_offset + blocks_to_solve > num_groups - 2) {
-        blocks_to_solve = remaining_carries - loop_offset;
-      }
-
-      // The group_resolved carries is used as an intermediate array
-      // First we need to copy the last resolved carry
-      cuda_memcpy_async_gpu_to_gpu(
-          group_resolved_carries,
-          resolved_carries + last_resolved_pos * big_lwe_size,
-          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-
-      // The array is filled with the blocks_to_solve
-      cuda_memcpy_async_gpu_to_gpu(
-          group_resolved_carries + big_lwe_size,
-          grouping_pgns + last_resolved_pos * big_lwe_size,
-          blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-
-      // Perform one group cumulative sum
-      host_radix_cumulative_sum_in_groups<Torus>(
-          streams[0], gpu_indexes[0], group_resolved_carries,
-          group_resolved_carries, blocks_to_solve + 1, big_lwe_size,
-          mem->grouping_size);
-
-      // Apply the lut
-      auto luts_sequential = mem->lut_sequential_algorithm;
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count,
-          group_resolved_carries + big_lwe_size,
-          group_resolved_carries + big_lwe_size, bsks, ksks, blocks_to_solve,
-          luts_sequential);
-
-      // Copy the result to the resolved carries array
-      cuda_memcpy_async_gpu_to_gpu(
-          resolved_carries + (last_resolved_pos + 1) * big_lwe_size,
-          group_resolved_carries + big_lwe_size,
-          blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-
-      last_resolved_pos += blocks_to_solve;
-    }
-  }
-}
-
 template <typename Torus>
 void host_compute_prefix_sum_hillis_steele(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -965,95 +707,6 @@ void host_compute_prefix_sum_hillis_steele(
  }
 }

-template <typename Torus>
-void host_compute_propagation_simulators_and_group_carries(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *block_states, int_radix_params params,
-    int_prop_simu_group_carries_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks, uint32_t num_groups) {
-
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-
-  auto propagation_cum_sums = mem->propagation_cum_sums;
-  auto group_size = mem->group_size;
-  host_radix_cumulative_sum_in_groups<Torus>(
-      streams[0], gpu_indexes[0], propagation_cum_sums, block_states,
-      num_blocks, big_lwe_size, group_size);
-
-  auto luts_array_second_step = mem->luts_array_second_step;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, propagation_cum_sums,
-      propagation_cum_sums, bsks, ksks, num_blocks, luts_array_second_step);
-
-  auto scalar_array_cum_sum = mem->scalar_array_cum_sum;
-  auto big_lwe_dimension = big_lwe_size - 1;
-
-  host_integer_radix_scalar_addition_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, propagation_cum_sums,
-      scalar_array_cum_sum, big_lwe_dimension, num_blocks, message_modulus,
-      carry_modulus);
-
-  uint32_t modulus_sup = message_modulus * carry_modulus;
-  Torus delta = (1ull << 63) / modulus_sup;
-  auto simulators = mem->simulators;
-  auto grouping_pgns = mem->grouping_pgns;
-  host_radix_split_simulators_and_grouping_pgns<Torus>(
-      streams[0], gpu_indexes[0], simulators, grouping_pgns,
-      propagation_cum_sums, num_blocks, big_lwe_size, group_size, delta);
-
-  auto resolved_carries = mem->resolved_carries;
-  if (mem->use_sequential_algorithm_to_resolver_group_carries) {
-    // Resolve group carries sequentially
-    host_resolve_group_carries_sequentially(
-        streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns,
-        params, mem->seq_group_prop_mem, bsks, ksks, num_groups);
-  } else {
-    // Resolve group carries with hillis steele
-    auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele;
-    host_compute_prefix_sum_hillis_steele<Torus>(
-        streams, gpu_indexes, gpu_count, &resolved_carries[big_lwe_size],
-        grouping_pgns, params, luts_carry_propagation_sum, bsks, ksks,
-        num_groups - 1);
-  }
-}
-
-template <typename Torus>
-void host_compute_shifted_blocks_and_borrow_states(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
-    int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride,
-    uint32_t lut_count) {
-
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
-  auto luts_array_first_step = mem->luts_array_first_step;
-
-  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states,
-      lwe_array, bsks, ksks, num_blocks, luts_array_first_step, lut_count,
-      lut_stride);
-
-  auto shifted_blocks = mem->shifted_blocks;
-  auto borrow_states = mem->borrow_states;
-  cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states,
-                               big_lwe_size_bytes * num_blocks, streams[0],
-                               gpu_indexes[0]);
-  cuda_memcpy_async_gpu_to_gpu(
-      shifted_blocks,
-      shifted_blocks_and_borrow_states + big_lwe_size * num_blocks,
-      big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]);
-}
-
 template <typename Torus>
 void host_propagate_single_carry(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
@@ -1539,247 +1192,4 @@ void host_apply_bivariate_lut_kb(
      radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift);
 }

-template <typename Torus>
-void scratch_cuda_fast_propagate_single_carry_kb_inplace(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_fast_sc_prop_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
-    uint32_t uses_carry, bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_fast_sc_prop_memory<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
-      uses_carry, allocate_gpu_memory);
-}
-
-template <typename Torus>
-void host_fast_propagate_single_carry(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array, Torus *carry_out,
-    const Torus *input_carries, int_fast_sc_prop_memory<Torus> *mem,
-    void *const *bsks, Torus *const *ksks, uint32_t num_blocks,
-    uint32_t requested_flag, uint32_t uses_carry) {
-  auto params = mem->params;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-  auto big_lwe_dimension = big_lwe_size - 1; // For host addition
-  auto lut_stride = mem->lut_stride;
-  auto lut_count = mem->lut_count;
-
-  enum outputFlag { NONE = 0, OVERFLOW = 1, CARRY = 2 };
-  if (uses_carry == 1) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                         input_carries, big_lwe_dimension, 1);
-  }
-
-  host_compute_shifted_blocks_and_states<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array, params,
-      mem->shifted_blocks_state_mem, bsks, ksks, num_blocks, lut_stride,
-      lut_count);
-  auto block_states = mem->shifted_blocks_state_mem->block_states;
-  if (requested_flag == outputFlag::OVERFLOW) {
-    // This operation could be added to the many lut with some trickery to be in
-    // parallel but first i will try to use different streams
-    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, mem->output_flag,
-        lwe_array + (num_blocks - 1) * big_lwe_size, bsks, ksks, 1,
-        lut_overflow_prep);
-  } else if (requested_flag == outputFlag::CARRY) {
-    cuda_memcpy_async_gpu_to_gpu(
-        mem->output_flag, block_states + (num_blocks - 1) * big_lwe_size,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-  }
-
-  host_compute_propagation_simulators_and_group_carries<Torus>(
-      streams, gpu_indexes, gpu_count, block_states, params,
-      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks,
-      mem->num_groups);
-
-  auto group_size = mem->prop_simu_group_carries_mem->group_size;
-
-  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
-  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
-  host_addition<Torus>(streams[0], gpu_indexes[0], prepared_blocks,
-                       shifted_blocks,
-                       mem->prop_simu_group_carries_mem->simulators,
-                       big_lwe_dimension, num_blocks);
-
-  if (requested_flag == outputFlag::OVERFLOW ||
-      requested_flag == outputFlag::CARRY) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
-                         mem->output_flag,
-                         mem->prop_simu_group_carries_mem->simulators +
-                             (num_blocks - 1) * big_lwe_size,
-                         big_lwe_dimension, 1);
-  }
-
-  for (uint j = 0; j < mem->active_gpu_count; j++) {
-    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-  }
-
-  // Add carries and cleanup OutputFlag::None
-  host_radix_sum_in_groups<Torus>(
-      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
-      mem->prop_simu_group_carries_mem->resolved_carries, num_blocks,
-      big_lwe_size, group_size);
-
-  auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
-      bsks, ksks, num_blocks, message_extract);
-
-  if (requested_flag == outputFlag::OVERFLOW ||
-      requested_flag == outputFlag::CARRY) {
-    // Here I could also do some trick to try to apply this function in parallel
-    // First i will try sequential, then i improve it
-
-    host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
-                         mem->output_flag, mem->output_flag,
-                         mem->prop_simu_group_carries_mem->resolved_carries +
-                             (mem->num_groups - 1) * big_lwe_size,
-                         big_lwe_dimension, 1);
-
-    if (requested_flag == outputFlag::OVERFLOW) {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-          mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-          mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
-    }
-    for (uint j = 0; j < mem->active_gpu_count; j++) {
-      cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
-                                   big_lwe_size_bytes, mem->sub_streams_2[j],
-                                   gpu_indexes[j]);
-    }
-  }
-
-  for (uint j = 0; j < mem->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
-  }
-}
-
-template <typename Torus>
-void scratch_cuda_integer_overflowing_sub(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_fast_borrow_prop_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    uint32_t compute_overflow, bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_fast_borrow_prop_memory<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      compute_overflow, allocate_gpu_memory);
-}
-
-template <typename Torus>
-void host_fast_borrow_propagate(cudaStream_t const *streams,
-                                uint32_t const *gpu_indexes, uint32_t gpu_count,
-                                Torus *lhsrhs_array, Torus *overflow_block,
-                                const Torus *input_borrow,
-                                int_fast_borrow_prop_memory<Torus> *mem,
-                                void *const *bsks, Torus *const *ksks,
-                                uint32_t num_blocks, uint32_t num_groups,
-                                uint32_t compute_overflow,
-                                uint32_t uses_input_borrow) {
-  auto params = mem->params;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-  auto big_lwe_dimension = big_lwe_size - 1;
-  auto lut_stride = mem->lut_stride;
-  auto lut_count = mem->lut_count;
-
-  assert(mem->num_groups >= num_groups);
-  if (uses_input_borrow == 1) {
-    host_unchecked_sub_with_correcting_term<Torus>(
-        streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
-        big_lwe_dimension, 1, message_modulus, carry_modulus,
-        message_modulus - 1);
-  }
-
-  host_compute_shifted_blocks_and_borrow_states<Torus>(
-      streams, gpu_indexes, gpu_count, lhsrhs_array, params,
-      mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_blocks, lut_stride,
-      lut_count);
-
-  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
-  cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
-                               borrow_states + (num_blocks - 1) * big_lwe_size,
-                               big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-
-  host_compute_propagation_simulators_and_group_carries<Torus>(
-      streams, gpu_indexes, gpu_count, borrow_states, params,
-      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks, num_groups);
-
-  auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
-  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
-  auto simulators = mem->prop_simu_group_carries_mem->simulators;
-
-  host_subtraction<Torus>(streams[0], gpu_indexes[0], prepared_blocks,
-                          shifted_blocks, simulators, big_lwe_dimension,
-                          num_blocks);
-
-  // unchecked_scalar_add_ssing
-  host_integer_radix_add_scalar_one_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
-      num_blocks, message_modulus, carry_modulus);
-
-  // unchecked_add_assing in overflow_block
-  if (compute_overflow == 1) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block,
-                         mem->prop_simu_group_carries_mem->simulators +
-                             (num_blocks - 1) * big_lwe_size,
-                         big_lwe_dimension, 1);
-  }
-  auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries;
-
-  // This needs to be done before because in next step we modify the resolved
-  // borrows
-  if (compute_overflow == 1) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block,
-                         resolved_borrows + (num_groups - 1) * big_lwe_size,
-                         big_lwe_dimension, 1);
-  }
-
-  cudaEventRecord(mem->incoming_events[0], streams[0]);
-  cudaStreamWaitEvent(mem->sub_streams_1[0], mem->incoming_events[0], 0);
-  cudaStreamWaitEvent(mem->sub_streams_2[0], mem->incoming_events[0], 0);
-
-  if (compute_overflow == 1) {
-    auto borrow_flag = mem->lut_borrow_flag;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
-        mem->overflow_block, bsks, ksks, 1, borrow_flag);
-  }
-  cudaEventRecord(mem->outgoing_events1[0], mem->sub_streams_1[0]);
-  // subtract borrow and cleanup prepared blocks
-  host_negation<Torus>(mem->sub_streams_2[0], gpu_indexes[0], resolved_borrows,
-                       resolved_borrows, big_lwe_dimension, num_groups);
-
-  host_radix_sum_in_groups<Torus>(
-      mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
-      resolved_borrows, num_blocks, big_lwe_size, mem->group_size);
-
-  auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array, prepared_blocks,
-      bsks, ksks, num_blocks, message_extract);
-
-  cudaEventRecord(mem->outgoing_events2[0], mem->sub_streams_2[0]);
-
-  cudaStreamWaitEvent(streams[0], mem->outgoing_events1[0], 0);
-  cudaStreamWaitEvent(streams[0], mem->outgoing_events2[0], 0);
-}
-
 #endif // TFHE_RS_INTERNAL_INTEGER_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -67,11 +67,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
 */
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
-    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
-    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size * glwe_dimension, lwe_dimension,
@@ -88,8 +89,8 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
  case 16384:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
        (cudaStream_t const *)(streams), gpu_indexes, gpu_count,
-        (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
-        allocate_gpu_memory);
+        (int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
+        num_radix_blocks, params, allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -126,65 +127,66 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 */
 void cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left,
-    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
-    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
+    void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
+    void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
+    void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks) {

  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 512:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 1024:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 2048:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 4096:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 8192:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 16384:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<const uint64_t *>(radix_lwe_left),
-        static_cast<const uint64_t *>(radix_lwe_right), bsks,
+        static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
+        static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
        (uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -9,6 +9,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "helper_multi_gpu.h"
+#include "integer/cmux.cuh"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
 #include "linear_algebra.h"
@@ -453,7 +454,8 @@ template <typename Torus, class params>
 __host__ void host_integer_mult_radix_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
-    uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
+    bool const is_bool_left, uint64_t const *radix_lwe_right,
+    bool const is_bool_right, void *const *bsks, uint64_t *const *ksks,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  auto glwe_dimension = mem_ptr->params.glwe_dimension;
@@ -464,6 +466,20 @@ __host__ void host_integer_mult_radix_kb(

  int big_lwe_dimension = glwe_dimension * polynomial_size;

+  if (is_bool_right) {
+    zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
+                       radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
+                       mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
+    return;
+  }
+
+  if (is_bool_left) {
+    zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
+                       radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
+                       mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
+    return;
+  }
+
  // 'vector_result_lsb' contains blocks from all possible right shifts of
  // radix_lwe_left, only nonzero blocks are kept
  int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;
@@ -562,30 +578,21 @@ __host__ void host_integer_mult_radix_kb(
      terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
      2 * num_blocks, mem_ptr->luts_array);

-  uint32_t block_modulus = message_modulus * carry_modulus;
-  uint32_t num_bits_in_block = std::log2(block_modulus);
-  // if (num_blocks < num_bits_in_block) {
-  //   auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
-  //   host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
-  //                                      radix_lwe_out, nullptr, nullptr,
-  //                                      scp_mem_ptr, bsks, ksks, num_blocks);
-  // } else {
-  auto fast_scp_mem_ptr = mem_ptr->fast_sc_prop_mem;
-  uint32_t requested_flag = 0;
-  uint32_t uses_carry = 0;
-  host_fast_propagate_single_carry<Torus>(
-      streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
-      fast_scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);
-  //}
+  auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
+  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
+                                     radix_lwe_out, nullptr, nullptr,
+                                     scp_mem_ptr, bsks, ksks, num_blocks);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
+    bool const is_boolean_left, bool const is_boolean_right,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
  *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                       is_boolean_left, is_boolean_right,
                                       num_radix_blocks, allocate_gpu_memory);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -38,7 +38,15 @@ void cuda_integer_radix_overflowing_sub_kb_64(
    void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks, uint32_t num_blocks) {

-  // auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
+  auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
+
+  host_integer_overflowing_sub_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(radix_lwe_out),
+      static_cast<uint64_t *>(radix_lwe_overflowed),
+      static_cast<const uint64_t *>(radix_lwe_left),
+      static_cast<const uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+      mem, num_blocks);
 }

 void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
  *mem_ptr = new int_overflowing_sub_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }
-/*
+
 template <typename Torus>
 __host__ void host_integer_overflowing_sub_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -113,39 +113,4 @@ __host__ void host_integer_overflowing_sub_kb(
                                          mem_ptr, bsks, ksks, num_blocks);
 }

-*/
-template <typename Torus>
-__host__ void host_integer_overflowing_sub(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
-    const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
-    int_fast_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
-    uint32_t uses_input_borrow) {
-
-  auto radix_params = mem_ptr->params;
-
-  // We need to recalculate the num_groups, because on the division the number
-  // of num_blocks changes
-  uint32_t block_modulus =
-      radix_params.message_modulus * radix_params.carry_modulus;
-  uint32_t num_bits_in_block = std::log2(block_modulus);
-  uint32_t grouping_size = num_bits_in_block;
-  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
-
-  auto stream = (cudaStream_t *)streams;
-  host_unchecked_sub_with_correcting_term<Torus>(
-      stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
-      static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
-      radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
-      radix_params.carry_modulus, radix_params.message_modulus - 1);
-
-  host_fast_borrow_propagate<Torus>(
-      streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
-      static_cast<Torus *>(overflow_block),
-      static_cast<const Torus *>(input_borrow),
-      (int_fast_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
-      num_blocks, num_groups, compute_overflow, uses_input_borrow);
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -112,24 +112,10 @@ __host__ void host_integer_scalar_mul_radix(
        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
        num_radix_blocks, j, nullptr);

-    // uint32_t carry_modulus = message_modulus;
-    // uint32_t block_modulus = message_modulus * carry_modulus;
-    // uint32_t num_bits_in_block = std::log2(block_modulus);
-    // if (num_radix_blocks < num_bits_in_block) {
-    //   auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
-    //   host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count,
-    //   lwe_array,
-    //                                  nullptr, nullptr, scp_mem_ptr, bsks,
-    //                                  ksks, num_radix_blocks);
-    // } else {
-    auto fast_scp_mem_ptr = mem->fast_sc_prop_mem;
-    uint32_t requested_flag = 0;
-    uint32_t uses_carry = 0;
-    host_fast_propagate_single_carry<T>(
-        streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
-        fast_scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag,
-        uses_carry);
-    //}
+    auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
+    host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
+                                   nullptr, nullptr, scp_mem_ptr, bsks, ksks,
+                                   num_radix_blocks);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -57,27 +57,6 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                          static_cast<const uint64_t *>(lwe_array_in_2),
                          input_lwe_dimension, input_lwe_ciphertext_count);
 }
-// last block it is the packing lhs*message_modulus + rhs
-void cuda_add_lwe_ciphertext_vector_64_with_packing(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in_1, void const *lwe_array_in_2,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus) {
-
-  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
-                          static_cast<uint64_t *>(lwe_array_out),
-                          static_cast<const uint64_t *>(lwe_array_in_1),
-                          static_cast<const uint64_t *>(lwe_array_in_2),
-                          input_lwe_dimension, input_lwe_ciphertext_count - 1);
-
-  host_pack_for_overflowing_ops<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in_1),
-      static_cast<const uint64_t *>(lwe_array_in_2), input_lwe_dimension,
-      input_lwe_ciphertext_count, message_modulus);
-}
-
 /*
 * Perform the addition of a u32 input LWE ciphertext vector with a u32
 * plaintext vector. See the equivalent operation on u64 data for more details.
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -82,45 +82,6 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
  check_cuda_error(cudaGetLastError());
 }

-template <typename T>
-__global__ void pack_for_overflowing_ops(T *output, T const *input_1,
-                                         T const *input_2, uint32_t num_entries,
-                                         uint32_t message_modulus) {
-
-  int tid = threadIdx.x;
-  int index = blockIdx.x * blockDim.x + tid;
-  if (index < num_entries) {
-    // Here we take advantage of the wrapping behaviour of uint
-    output[index] = input_1[index] * message_modulus + input_2[index];
-  }
-}
-
-template <typename T>
-__host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
-                                            uint32_t gpu_index, T *output,
-                                            T const *input_1, T const *input_2,
-                                            uint32_t input_lwe_dimension,
-                                            uint32_t input_lwe_ciphertext_count,
-                                            uint32_t message_modulus) {
-
-  cudaSetDevice(gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = input_lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  pack_for_overflowing_ops<T><<<grid, thds, 0, stream>>>(
-      &output[input_lwe_ciphertext_count - 1],
-      &input_1[input_lwe_ciphertext_count - 1],
-      &input_2[input_lwe_ciphertext_count - 1], lwe_size, message_modulus);
-  check_cuda_error(cudaGetLastError());
-}
-
 template <typename T>
 __global__ void subtraction(T *output, T const *input_1, T const *input_2,
                            uint32_t num_entries) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -24,8 +24,8 @@ __device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
                                           uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
-              level * polynomial_size / 2 * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) +
+              (level_count - level - 1) * polynomial_size / 2 *
+                  (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
 }

@@ -35,8 +35,8 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
-              level * polynomial_size / 2 * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) +
+              (level_count - level - 1) * polynomial_size / 2 *
+                  (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
 }
 template <typename T>
@@ -45,8 +45,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
-              level * polynomial_size / 2 * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) +
+              (level_count - level - 1) * polynomial_size / 2 *
+                  (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              glwe_dimension * polynomial_size / 2];
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -7,6 +7,7 @@
 #include "fft/bnsmfft.cuh"
 #include "helper_multi_gpu.h"
 #include "pbs/programmable_bootstrap_multibit.h"
+#include "polynomial/polynomial_math.cuh"

 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
@@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
                        double2 *global_memory_buffer, uint32_t polynomial_size,
                        uint32_t glwe_dimension, bool support_dsm);

-template <typename Torus, typename G, class params>
+/** Perform the matrix multiplication between the GGSW and the GLWE,
+ * each block operating on a single level for mask and body.
+ * Both operands must already be in the Fourier domain.
+ *
+ * This function assumes:
+ *  - Thread blocks at dimension x relate to the decomposition level.
+ *  - Thread blocks at dimension y relate to the glwe dimension.
+ *  - polynomial_size / params::opt threads are available per block
+ */
+template <typename G, class params>
 __device__ void
-mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
-              const double2 *__restrict__ bootstrapping_key,
-              int polynomial_size, uint32_t glwe_dimension, int level_count,
-              int iteration, G &group, bool support_dsm = false) {
-
-  // Switch to the FFT space
-  NSMFFT_direct<HalfDegree<params>>(fft);
-  synchronize_threads_in_block();
-
-  // Get the pieces of the bootstrapping key that will be needed for the
-  // external product; blockIdx.x is the ID of the block that's executing
-  // this function, so we end up getting the lines of the bootstrapping key
-  // needed to perform the external product in this block (corresponding to
-  // the same decomposition level)
-  auto bsk_slice = get_ith_mask_kth_block(
-      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
-      glwe_dimension, level_count);
-
-  // Perform the matrix multiplication between the GGSW and the GLWE,
-  // each block operating on a single level for mask and body
+mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
+                                const double2 *__restrict__ bootstrapping_key,
+                                int iteration, G &group,
+                                bool support_dsm = false) {
+  const uint32_t polynomial_size = params::degree;
+  const uint32_t glwe_dimension = gridDim.y - 1;
+  const uint32_t level_count = gridDim.x;

  // The first product is used to initialize level_join_buffer
-  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
-  auto buffer_slice =
-      get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
-                                 polynomial_size, glwe_dimension, support_dsm);
-
-  int tid = threadIdx.x;
-  for (int i = 0; i < params::opt / 2; i++) {
-    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
-    tid += params::degree / params::opt;
-  }
-
-  group.sync();

  // Continues multiplying fft by every polynomial in that particular bsk level
  // Each y-block accumulates in a different polynomial at each iteration
-  for (int j = 1; j < (glwe_dimension + 1); j++) {
+  auto bsk_slice = get_ith_mask_kth_block(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
+      glwe_dimension, level_count);
+  for (int j = 0; j < glwe_dimension + 1; j++) {
    int idx = (j + this_block_rank) % (glwe_dimension + 1);

-    auto bsk_poly = bsk_slice + idx * params::degree / 2;
+    auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
    auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
                                                   join_buffer, polynomial_size,
                                                   glwe_dimension, support_dsm);

-    int tid = threadIdx.x;
-    for (int i = 0; i < params::opt / 2; i++) {
-      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
-      tid += params::degree / params::opt;
-    }
+    polynomial_product_accumulate_in_fourier_domain<params, double2>(
+        buffer_slice, fft, bsk_poly, j == 0);
    group.sync();
  }

@@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
  // All blocks are synchronized here; after this sync, level_join_buffer has
  // the values needed from every other block

-  auto src_acc =
-      get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
-                                 polynomial_size, glwe_dimension, support_dsm);
-
-  // copy first product into fft buffer
-  tid = threadIdx.x;
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] = src_acc[tid];
-    tid += params::degree / params::opt;
-  }
-  synchronize_threads_in_block();
-
  // accumulate rest of the products into fft buffer
-  for (int l = 1; l < gridDim.x; l++) {
+  for (int l = 0; l < level_count; l++) {
    auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
                                                  join_buffer, polynomial_size,
                                                  glwe_dimension, support_dsm);
-    tid = threadIdx.x;
-    for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] += cur_src_acc[tid];
-      tid += params::degree / params::opt;
-    }
+
+    polynomial_accumulate_in_fourier_domain<params>(fft, cur_src_acc, l == 0);
  }

  synchronize_threads_in_block();
-
-  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
-  // accumulator
-  NSMFFT_inverse<HalfDegree<params>>(fft);
-  synchronize_threads_in_block();
-
-  add_to_torus<Torus, params>(fft, accumulator);
-
-  __syncthreads();
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -117,8 +117,8 @@ __global__ void device_programmable_bootstrap_amortized(

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
-    round_to_closest_multiple_inplace<Torus, params::opt,
-                                      params::degree / params::opt>(
+    init_decomposer_state_inplace<Torus, params::opt,
+                                  params::degree / params::opt>(
        accumulator_rotated, base_log, level_count, glwe_dimension + 1);

    // Initialize the polynomial multiplication via FFT arrays
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -117,8 +117,8 @@ __global__ void device_programmable_bootstrap_cg(

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
-    round_to_closest_multiple_inplace<Torus, params::opt,
-                                      params::degree / params::opt>(
+    init_decomposer_state_inplace<Torus, params::opt,
+                                  params::degree / params::opt>(
        accumulator_rotated, base_log, level_count);

    synchronize_threads_in_block();
@@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg(
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, grid_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
-        polynomial_size, glwe_dimension, level_count, i, grid);
-
+    mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
+        accumulator_fft, block_join_buffer, bootstrapping_key, i, grid);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator);
  }

  auto block_lwe_array_out =
@@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg(
                         (glwe_dimension * polynomial_size + 1) +
                     blockIdx.y * polynomial_size];

-  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-    // Perform a sample extract. At this point, all blocks have the result, but
-    // we do the computation at block 0 to avoid waiting for extra blocks, in
-    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
+  if (blockIdx.x == 0) {
+    if (blockIdx.y < glwe_dimension) {
+      // Perform a sample extract. At this point, all blocks have the result,
+      // but we do the computation at block 0 to avoid waiting for extra blocks,
+      // in case they're not synchronized
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];

-        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 1, i * lut_stride);
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 1, i * lut_stride);
+        }
      }
-    }
-  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
+    } else if (blockIdx.y == glwe_dimension) {
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {

-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];

-        sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 0, i * lut_stride);
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
      }
    }
  }
@@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg(
  uint64_t partial_dm = full_dm - partial_sm;

  int8_t *d_mem = buffer->d_mem;
-  double2 *buffer_fft = buffer->global_accumulator_fft;
+  double2 *buffer_fft = buffer->global_join_buffer;

  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
        int8_t *device_mem, uint64_t device_memory_size_per_block,
        uint32_t lut_count, uint32_t lut_stride) {
-
  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
@@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

-  Torus *accumulator = (Torus *)selected_memory;
+  Torus *accumulator_rotated = (Torus *)selected_memory;
  double2 *accumulator_fft =
-      (double2 *)accumulator +
+      (double2 *)accumulator_rotated +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));

  if constexpr (SMD == PARTIALSM)
@@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                   params::degree / 2];

-  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+  Torus *global_accumulator_slice =
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];

-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.z * keybundle_size_per_input];

  if (lwe_offset == 0) {
    // Put "b" in [0, 2N[
@@ -87,92 +85,95 @@ __global__ void __launch_bounds__(params::degree / params::opt)

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
-        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
+        b_hat, false);
  } else {
-    // Load the accumulator calculated in previous iterations
+    // Load the accumulator_rotated calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        global_slice, accumulator);
+        global_accumulator_slice, accumulator_rotated);
  }

  for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
-    round_to_closest_multiple_inplace<Torus, params::opt,
-                                      params::degree / params::opt>(
-        accumulator, base_log, level_count);
+    init_decomposer_state_inplace<Torus, params::opt,
+                                  params::degree / params::opt>(
+        accumulator_rotated, base_log, level_count);

-    // Decompose the accumulator. Each block gets one level of the
+    // Decompose the accumulator_rotated. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
-    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    // accumulator_rotated decomposed at level 0, 1 at 1, etc.)
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
+                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, grid_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, keybundle,
-        polynomial_size, glwe_dimension, level_count, i, grid);
-
+    mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
+        accumulator_fft, block_join_buffer, keybundle, i, grid);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
  }

-  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
-    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                           (glwe_dimension * polynomial_size + 1) +
-                       blockIdx.y * polynomial_size];
+  auto accumulator = accumulator_rotated;

-    if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-      // Perform a sample extract. At this point, all blocks have the result,
-      // but we do the computation at block 0 to avoid waiting for extra blocks,
-      // in case they're not synchronized
-      // Always extract one by default
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+  if (blockIdx.x == 0) {
+    if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
+      auto block_lwe_array_out =
+          &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                             (glwe_dimension * polynomial_size + 1) +
+                         blockIdx.y * polynomial_size];

-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
+      if (blockIdx.y < glwe_dimension) {
+        // Perform a sample extract. At this point, all blocks have the result,
+        // but we do the computation at block 0 to avoid waiting for extra
+        // blocks, in case they're not synchronized. Always extract one by
+        // default
+        sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);

-          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 1, i * lut_stride);
-        }
-      }
-
-    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
-
-          sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 0, i * lut_stride);
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 1, i * lut_stride);
+          }
+        }
+
+      } else if (blockIdx.y == glwe_dimension) {
+
+        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 0, i * lut_stride);
+          }
        }
      }
+    } else {
+      // Store the accumulator so the next chunk iteration can load it
+      copy_polynomial<Torus, params::opt, params::degree / params::opt>(
+          accumulator, global_accumulator_slice);
    }
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        accumulator, global_slice);
  }
 }

@@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop(
    uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
    uint32_t lut_stride) {

-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-  uint64_t full_dm =
+  uint64_t full_sm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
-  uint64_t partial_dm =
+  uint64_t partial_sm =
      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
+
+  auto full_dm = full_sm;
+  auto partial_dm = full_sm - partial_sm;
  uint64_t no_dm = 0;

+  auto lwe_chunk_size = buffer->lwe_chunk_size;
  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);

@@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop(

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;

  auto d_mem = buffer->d_mem_acc_cg;
  auto keybundle_fft = buffer->keybundle_fft;
  auto global_accumulator = buffer->global_accumulator;
-  auto buffer_fft = buffer->global_accumulator_fft;
+  auto join_buffer = buffer->global_join_buffer;

  void *kernel_args[22];
  kernel_args[0] = &lwe_array_out;
@@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop(
  kernel_args[4] = &lwe_array_in;
  kernel_args[5] = &lwe_input_indexes;
  kernel_args[6] = &keybundle_fft;
-  kernel_args[7] = &buffer_fft;
+  kernel_args[7] = &join_buffer;
  kernel_args[8] = &global_accumulator;
  kernel_args[9] = &lwe_dimension;
  kernel_args[10] = &glwe_dimension;
@@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop(
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, PARTIALSM>,
-        grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
+        grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
  } else {
    kernel_args[19] = &no_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, FULLSM>,
-        grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
+        grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -654,8 +654,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
-  if (base_log > 32)
-    PANIC("Cuda error (classical PBS): base log should be <= 32")
+  if (base_log > 64)
+    PANIC("Cuda error (classical PBS): base log should be <= 64")

  pbs_buffer<uint64_t, CLASSICAL> *buffer =
      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        const Torus *__restrict__ lwe_array_in,
        const Torus *__restrict__ lwe_input_indexes,
        const double2 *__restrict__ bootstrapping_key,
-        Torus *global_accumulator, double2 *global_accumulator_fft,
+        Torus *global_accumulator, double2 *global_join_buffer,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
        int8_t *device_mem, uint64_t device_memory_size_per_block) {
@@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

  double2 *global_fft_slice =
-      global_accumulator_fft +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
-       blockIdx.z * level_count * (glwe_dimension + 1)) *
-          (polynomial_size / 2);
+      global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
+                            blockIdx.z * level_count * (glwe_dimension + 1)) *
+                               (polynomial_size / 2);

  if (lwe_iteration == 0) {
    // First iteration
@@ -107,8 +106,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
-  round_to_closest_multiple_inplace<Torus, params::opt,
-                                    params::degree / params::opt>(
+  init_decomposer_state_inplace<Torus, params::opt,
+                                params::degree / params::opt>(
      accumulator, base_log, level_count);

  synchronize_threads_in_block();
@@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        const Torus *__restrict__ lut_vector,
        const Torus *__restrict__ lut_vector_indexes,
        const double2 *__restrict__ bootstrapping_key,
-        Torus *global_accumulator, double2 *global_accumulator_fft,
+        Torus *global_accumulator, double2 *global_join_buffer,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
        int8_t *device_mem, uint64_t device_memory_size_per_block,
@@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    accumulator_fft = (double2 *)sharedmem;

  for (int level = 0; level < level_count; level++) {
-    double2 *global_fft_slice = global_accumulator_fft +
-                                (level + blockIdx.x * level_count) *
-                                    (glwe_dimension + 1) * (params::degree / 2);
+    double2 *global_fft_slice =
+        global_join_buffer + (level + blockIdx.x * level_count) *
+                                 (glwe_dimension + 1) * (params::degree / 2);

    for (int j = 0; j < (glwe_dimension + 1); j++) {
      double2 *fft = global_fft_slice + j * params::degree / 2;
@@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
  }
  // Otherwise, both kernels run all in shared memory
  uint64_t buffer_size = device_mem +
-                         // global_accumulator_fft
+                         // global_join_buffer
                         (glwe_dimension + 1) * level_count *
                             input_lwe_ciphertext_count *
                             (polynomial_size / 2) * sizeof(double2) +
@@ -368,7 +367,7 @@ __host__ void execute_step_one(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
+    Torus *global_accumulator, double2 *global_join_buffer,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -383,21 +382,21 @@ __host__ void execute_step_one(
    device_programmable_bootstrap_step_one<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_programmable_bootstrap_step_one<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0);
  }
@@ -409,7 +408,7 @@ __host__ void execute_step_two(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
+    Torus *global_accumulator, double2 *global_join_buffer,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -425,21 +424,21 @@ __host__ void execute_step_two(
    device_programmable_bootstrap_step_two<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm, lut_count, lut_stride);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm, lut_count, lut_stride);
  } else {
    device_programmable_bootstrap_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0, lut_count, lut_stride);
  }
@@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap(
  uint64_t full_dm_step_two = full_sm_step_two;

  Torus *global_accumulator = pbs_buffer->global_accumulator;
-  double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
+  double2 *global_join_buffer = pbs_buffer->global_join_buffer;
  int8_t *d_mem = pbs_buffer->d_mem;

  for (int i = 0; i < lwe_dimension; i++) {
    execute_step_one<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, global_accumulator,
-        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_step_two<Torus, params>(
        stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
-        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
        lut_count, lut_stride);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Beka Barbakadze	d45cb74476	fix precision	2025-02-05 20:00:20 +04:00
Beka Barbakadze	772c049681	fix some bugs	2025-02-03 16:02:57 +04:00
Beka Barbakadze	516ae67990	feat(gpu): Implement fft128 in cuda backend	2025-01-20 15:43:19 +04:00
dependabot[bot]	db61b0bb9b	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.16 to 3.0.17. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`38608ef4fb...5d6ac37a4c`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-25 11:06:28 +01:00
dependabot[bot]	dc8091ad0f	chore(deps): bump actions/upload-artifact from 3.1.2 to 4.4.3 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3.1.2 to 4.4.3. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v3.1.2...b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-25 11:06:19 +01:00
dependabot[bot]	3ccfb9616a	chore(deps): bump zama-ai/slab-github-runner from 1.2.0 to 1.3.0 Bumps [zama-ai/slab-github-runner](https://github.com/zama-ai/slab-github-runner) from 1.2.0 to 1.3.0. - [Release notes](https://github.com/zama-ai/slab-github-runner/releases) - [Commits](https://github.com/zama-ai/slab-github-runner/compare/v1.2.0...98f0788261a7323d5d695a883e20df36591a92b7) --- updated-dependencies: - dependency-name: zama-ai/slab-github-runner dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-25 11:06:12 +01:00
dependabot[bot]	83dc9b9453	chore(deps): bump dtolnay/rust-toolchain Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 7b1c307e0dcbda6122208f10795a713336a9b35a to 315e265cd78dad1e1dcf3a5074f6d6c47029d5aa. - [Release notes](https://github.com/dtolnay/rust-toolchain/releases) - [Commits](`7b1c307e0d...315e265cd7`) --- updated-dependencies: - dependency-name: dtolnay/rust-toolchain dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-25 11:06:04 +01:00
dependabot[bot]	4fe72a15c0	chore(deps): bump rtCamp/action-slack-notify from 2.2.1 to 2.3.2 Bumps [rtCamp/action-slack-notify](https://github.com/rtcamp/action-slack-notify) from 2.2.1 to 2.3.2. - [Release notes](https://github.com/rtcamp/action-slack-notify/releases) - [Commits](https://github.com/rtcamp/action-slack-notify/compare/v2.2.1...c33737706dea87cd7784c687dadc9adf1be59990) --- updated-dependencies: - dependency-name: rtCamp/action-slack-notify dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-25 11:05:56 +01:00
David Testé	2a18d6fa32	chore(ci): fix gpu integer benchmarks ops flavor parsing	2024-11-22 15:43:32 +01:00
David Testé	8c2358a9e1	chore(ci): refacto erc20 gpu bench workflows to reduce duplicates Now there is only one entry point to trigger ERC20 benchmarks manually. This entry point uses a sub-workflow responsible for provisioning and running the benchmarks. A weekly workflow is also created with all the targets needed.	2024-11-22 15:43:32 +01:00
David Testé	c3def17ad8	chore(ci): fix gpu integer benchmark workflow	2024-11-22 11:25:18 +01:00
David Testé	9da58f68c7	chore(ci): refacto gpu bench workflows to reduce duplicates Now there is only one entry point to trigger benchmarks manually. This entry point uses a sub-workflow responsible for provisioning and running the benchmarks. A weekly workflow is also created with all the targets needed. This also adds the possibility to run throughput benchmarks on-demand.	2024-11-21 17:05:49 +01:00
David Testé	5c226e98ba	chore(bench): rename env var to handle multi-bit parameters set	2024-11-21 14:30:20 +01:00
Beka Barbakadze	27ccfbd939	feat(gpu): optimize integer mul when one of the ct holds boolean	2024-11-21 14:28:06 +01:00
Arthur Meyre	40dd2a6ecc	chore(fft): make bench naming easier to read	2024-11-21 10:47:48 +01:00
Arthur Meyre	2d9c13569f	chore: rename concrete-csprng to tfhe-csprng	2024-11-21 10:47:48 +01:00
Arthur Meyre	36deaec607	chore(ntt): bring concrete-ntt in the repo as tfhe-ntt	2024-11-21 10:47:48 +01:00
tmontaigu	fcc0378c98	fix(hlapi): rework CompressedCiphertextListBuilder The hlapi builder target device was selected depending on features (gpu enabled ? gpu : cpu), but if at `build` time the thread_local key did not match the expected device, an error would be returned. This is a bit too limiting for users that might want to do some processing on GPU and compression on CPU. So the Builder is changed to delay, the selection of device used to compress when `build` is called. This new design is more flexible for end users, at the cost of a bit more memory copies * There should be no API breaking change * There is no serialization breaking change as only the builder (which is not serializable) has been changed	2024-11-21 10:46:18 +01:00
David Testé	b31fbf5f23	chore(bench): fix result parsing for object and key generation	2024-11-20 11:55:22 +01:00
David Testé	b5c614520a	chore(ci): fix recipe for integer benchmarks on gpu	2024-11-20 11:55:22 +01:00
Mayeul@Zama	46cf465637	refactor(strings): comparisons take a GenericPattern	2024-11-20 09:54:15 +01:00
Mayeul@Zama	11a0fe2b40	chore(strings): support multi bit params in test	2024-11-20 09:54:15 +01:00
Mayeul@Zama	7dcb5bd4a6	chore(strings): add GenericPatternRef	2024-11-20 09:54:15 +01:00
Mayeul@Zama	55a112cca5	chore(strings): update Makefile for strings	2024-11-20 09:54:15 +01:00
Mayeul@Zama	992c062db0	chore(strings): add strings tests to CI	2024-11-20 09:54:15 +01:00
Mayeul@Zama	58f5a2c593	chore(strings): add print_trivial	2024-11-20 09:54:15 +01:00
Mayeul@Zama	14c10c374e	test(strings): rename tests	2024-11-20 09:54:15 +01:00
Mayeul@Zama	0d202e6e03	test(strings): use trivial encryption in most tests	2024-11-20 09:54:15 +01:00
Mayeul@Zama	4aaa3b67d6	chore(strings): add trivial_encrypt	2024-11-20 09:54:15 +01:00
Mayeul@Zama	609e24bf7c	chore(shortint): add trivial encrypt to client key	2024-11-20 09:54:15 +01:00
Mayeul@Zama	5cd5fbe1f2	chore(strings): use keycache in tests	2024-11-20 09:54:15 +01:00
Mayeul@Zama	089efd7b17	chore(strings): split tests and increase coverage	2024-11-20 09:54:15 +01:00
Mayeul@Zama	a582aadd5d	test(strings): rename test module	2024-11-20 09:54:15 +01:00
Mayeul@Zama	19d0a3d8c3	fix(strings): fix empty strings conversion to uint	2024-11-20 09:54:15 +01:00
Mayeul@Zama	af49b99724	fix(integer): fix overflow	2024-11-20 09:54:15 +01:00
Mayeul@Zama	cf713821da	fix(strings): fix underflow	2024-11-20 09:54:15 +01:00
Mayeul@Zama	a5fb99ee36	fix(strings): fix padding issue	2024-11-20 09:54:15 +01:00
Beka Barbakadze	ac1284679e	feat(gpu): Implement signed division in cuda backend	2024-11-20 09:07:38 +01:00
David Testé	9059ddeacc	chore(bench): add throughput benchmarks to suite All integer benchmarks make recipes can be run to ouput throughput results. Only CPU is supported for throughput benchmarks in GitHub CI.	2024-11-19 12:07:50 +01:00
Arthur Meyre	904ffa729b	chore(ci): do not run fft workflows on push to main	2024-11-19 10:43:54 +01:00
Arthur Meyre	c9b4ee84ae	chore(ci): fix fft bench parser	2024-11-19 10:43:23 +01:00
Arthur Meyre	d56e7e0b2a	chore(bench): fix fft bench again	2024-11-18 16:56:29 +01:00
Arthur Meyre	6d2206e5ac	chore(bench): fix fft bench	2024-11-18 15:37:53 +01:00
Mayeul@Zama	015b11d309	chore(test): lower p_value_limit to decrease test failure probability	2024-11-18 15:15:33 +01:00
dependabot[bot]	e390e8eb5a	chore(deps): bump codecov/codecov-action from 4.6.0 to 5.0.2 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 4.6.0 to 5.0.2. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](`b9fd7d16f6...5c47607acb`) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-18 13:27:43 +01:00
Arthur Meyre	6a161fef0a	chore: bring concrete-fft as tfhe-fft in the repo	2024-11-18 13:17:58 +01:00
Arthur Meyre	9fbd96f016	chore(ci): remove outdated forward compat feature	2024-11-18 13:17:58 +01:00
Nicolas Sarlin	a45b7b3974	chore(zk): add benches to tfhe-zk-pok	2024-11-18 13:17:28 +01:00
Nicolas Sarlin	e59a680407	fix(core): fix compact pke with single lwe	2024-11-15 17:08:22 +01:00
Mayeul@Zama	cf7968ac6c	chore(ci): fix bash script	2024-11-15 14:14:04 +01:00
Agnes Leroy	7aa454ee97	chore(gpu): update asserts on base log now that we don't cast to u32 in decomposition	2024-11-15 13:24:52 +01:00
Beka Barbakadze	0aee4c568e	feat(gpu): add abs operation on gpu backend	2024-11-15 13:24:13 +01:00
tmontaigu	f9e8df49d2	chore: add parameters getters for CompactPublicKey types	2024-11-14 19:08:39 +01:00
Mayeul@Zama	cf56e5853f	chore(ci): fix OOM when linking c_api tests	2024-11-14 15:56:56 +01:00
Mayeul@Zama	b2e8ef6010	chore(ci): use cpu_count.sh where possible	2024-11-14 15:56:56 +01:00
tmontaigu	bb327b09ae	feat(capi): add mechanism to get panic message as const char * Previously, when an error occurred in the rust side, the panic message would get printed to stderr, then the c function would return 1 to indicate error. This commit adds the ability to disable the automatic prints of panic messages and adds functions to get the panic message as a const char * to allow user better control on how to display error messages.	2024-11-14 15:24:29 +01:00
Pedro Alves	5a664aa30d	chore(gpu): simplifications to the zero_out_if method	2024-11-13 15:23:04 -03:00
Agnes Leroy	4264ba2e20	chore(gpu): remove 3_3 group 2 tests to gain time in the ci	2024-11-13 16:14:45 +01:00
Guillermo Oyarzun	b18aa0df54	fix(gpu): fix signed overflowing sub for one block case	2024-11-13 15:20:22 +01:00
Agnes Leroy	a501285206	chore(gpu): change target for multi-gpu tests	2024-11-13 15:06:46 +01:00
Arthur Meyre	d28040342c	chore(gpu): use same balanced decomposition code as in the CPU code	2024-11-13 14:26:13 +01:00
Pedro Alves	b041608d25	fix(gpu): general fixes and improvements to PBS - update pbs test parameters to match tfhe-rs' integer tests - refactor mul_ggsw_glwe to make it easier to read - fix the way we accumulate the external product result on multi-bit PBS	2024-11-13 13:36:55 +01:00
Arthur Meyre	eac30027e9	chore(ci): run bench profile as ubuntu	2024-11-12 16:59:38 +01:00
dependabot[bot]	aaba7e5916	chore(deps): bump tj-actions/changed-files from 45.0.3 to 45.0.4 Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.3 to 45.0.4. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](`c3a1bb2c99...4edd678ac3`) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-12 14:16:30 +01:00
Agnes Leroy	d29ed6b60c	chore(gpu): trigger GPU tests if tests are modified	2024-11-12 09:12:25 +01:00
Arthur Meyre	9ee18dd2c7	test: add tag check for parameter keyswitch in HL API	2024-11-08 18:03:01 +01:00
Nicolas Sarlin	6ef22e8cb9	refactor(zk)!: directly use the `CompactPkeCrs` in all public APIs BREAKING_CHANGE: - All the zk API (build_with_proof, verify, verify_and_expand,...) now take a `CompactPkeCrs` instead of a `CompactPkePublicParams`. Serialized `CompactPkePublicParams` from previous versions can be converted into a `CompactPkeCrs` using `params.into()`	2024-11-08 17:50:34 +01:00
Nicolas Sarlin	fa7a6281ad	chore(tfhe): prepare release 0.11.0	2024-11-08 17:50:34 +01:00
Agnes Leroy	5c189d6bf3	chore(ci): use function executor for abs and signed div tests	2024-11-07 17:17:00 +01:00
Nicolas Sarlin	f8bde7fbde	fix(zk): fix build with feature zk-pok without shortint	2024-11-07 16:06:26 +01:00
Nicolas Sarlin	f9c4627946	doc: update the doc with the new msrv	2024-11-07 14:58:37 +01:00
Nicolas Sarlin	5dd6d8d569	chore(ci): enable safe_serialization tests	2024-11-07 13:37:31 +01:00
Nicolas Sarlin	5e3b793fd7	feat(zk): add conformance for zk proof and crs	2024-11-07 09:33:16 +01:00
Nicolas Sarlin	295b6608ee	feat(zk): check that proof and crs points are valid	2024-11-07 09:33:16 +01:00
tmontaigu	5c42fc950e	chore: make more add/sub test use variable num_blocks	2024-11-06 16:43:01 +01:00
Mayeul@Zama	ff6e9cab63	refactor(string): use custom iterator to avoid allocation	2024-11-06 14:44:09 +01:00
Mayeul@Zama	e88222987a	chore(fhe_strings): limit max n to improve performance	2024-11-06 14:44:09 +01:00
Mayeul@Zama	bcae0f1beb	fix(strings): fix underflow	2024-11-06 14:44:09 +01:00
Mayeul@Zama	a6a5716e37	chore(strings): use is_empty function	2024-11-06 14:44:09 +01:00
Mayeul@Zama	829b00bb6d	chore(strings): cleanup function	2024-11-06 14:44:09 +01:00
Mayeul@Zama	de1cc0a863	feat(strings): add support for custom params	2024-11-06 14:44:09 +01:00
Mayeul@Zama	69b6c3a353	refactor(strings): move test_all function in separate module	2024-11-06 14:44:09 +01:00
Mayeul@Zama	2fcde61e98	refactor(strings): use integer keys	2024-11-06 14:44:09 +01:00
Mayeul@Zama	c22f6ff70e	fix(strings): fix clippy lints	2024-11-06 14:44:09 +01:00
Mayeul@Zama	fcf7e66d43	chore(strings): cleanup	2024-11-06 14:44:09 +01:00
Mayeul@Zama	fc28ea5a30	fix(strings): fixes after strings move	2024-11-06 14:44:09 +01:00
Mayeul@Zama	8680e1de0a	refactor(strings): move fhe_strings from examples to strings module	2024-11-06 14:44:09 +01:00
Nicolas Sarlin	daf57f5665	chore(zk): update arkworks to 0.5.0	2024-11-06 11:53:34 +01:00
Nicolas Sarlin	ccf0dc3ad8	fix(zk): fix zk wasm x86_64 tests	2024-11-06 11:17:06 +01:00
Agnes Leroy	ba5e717183	chore(gpu): add workflows for erc20 with 2 and 8 H100	2024-11-06 09:38:05 +01:00
Arthur Meyre	615ed3d5db	refactor(tfhe)!: update key level order for better performance - use natural order for decomposition levels in bsk co-authored-by: Agnes Leroy <agnes.leroy@zama.ai>	2024-11-05 17:23:57 +01:00
Arthur Meyre	dda93889da	chore: update data backward compatibility branch	2024-11-05 17:23:57 +01:00
Arthur Meyre	748b88e905	chore(tfhe): update version to 0.10.0	2024-11-05 17:23:57 +01:00
Arthur Meyre	612657260f	chore: bump CUDA backend version to 0.6.0	2024-11-05 17:23:57 +01:00
Nicolas Sarlin	6ee3eb17b9	chore(zk): add a proof compat test between x86_64 and wasm	2024-11-05 17:07:04 +01:00
Agnes Leroy	c1374a0e10	chore(gpu): increase sm for rtxa6000	2024-11-05 12:11:36 +01:00
Agnes Leroy	a9601fc47d	chore(gpu): remove decompressed ct comparison btw cpu and gpu The results are not expected to match bitwise	2024-11-04 15:01:53 -03:00
Agnes Leroy	bd255cd958	chore(gpu): rework ci to adapt to the shortage of h100	2024-11-04 15:23:43 +01:00
Arthur Meyre	6fe36799fd	chore(ci): fix clippy issue for M1 build	2024-11-04 12:53:58 +01:00
dependabot[bot]	02419d6852	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.15 to 3.0.16. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`ed00f72a3c...38608ef4fb`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-11-04 12:53:09 +01:00