Compare commits

...

28 Commits

Author SHA1 Message Date
Agnes Leroy
071934532e add back functions 2025-06-04 12:53:45 +02:00
Agnes Leroy
87b41ebbe5 More prints degrees 2025-06-04 12:53:44 +02:00
Agnes Leroy
7ba7dd3e19 Debug with long run tests 2025-06-04 12:53:44 +02:00
Beka Barbakadze
7f122bb435 refactor(gpu): remove debug prints 2025-06-04 12:53:32 +02:00
Beka Barbakadze
56c0811751 fix(gpu): fix ilog2 bug 2025-06-04 12:53:32 +02:00
Beka Barbakadze
580ac5aa99 fix(gpu): mul not using mem reuse 2025-06-04 12:53:32 +02:00
Beka Barbakadze
cf34e9b7f0 refactor(gpu): refactor and optimize sum_ciphertext in cuda backend 2025-06-04 12:53:31 +02:00
David Testé
856fc1a709 chore(ci): ignore stale action refs on rust-toolchain action
This action doesn't create releases, so the action refs don't point to a known tag.
If this zizmor finding is not ignored, the continuous integration pipeline is broken.
2025-06-04 11:48:01 +02:00
Pedro Alves
fe0a195630 chore(gpu): switches from the TBC PBS to the other variants for many inputs 2025-06-04 05:45:53 -03:00
tmontaigu
aca7e79585 feat(csprng): add Xof random generation
This adds a new kind of seed to the csprng

When created with such a seed, the AES-CTR random generator
initialization changes:
- The AES-KEY used is initialized differently
- The AES-CTR starts with a CTR that may not be 0

The changes make it so that the counter still goes from 0..MAX,
but now the AES-CTR will encrypt the counter + some offset, allowing
us to keep both the regular behavior and the new one
2025-06-04 09:57:18 +02:00
tmontaigu
c0e89a53ef fix(csprng): fix an endianness issue for the counter
This commit fixes the endianness (little-endian) of the counter
representation used in the AES-CTR.

This is so that the random bytes generated are the same no matter
the endianness of the system.

A test case with known answers is added, as well as a make command
to run the test in an emulated big-endian arch using the `cross`
utility.

This also includes a small refactor where the block cipher no longer
encrypts `AesIndex`. This is done as it makes more sense
(AES encrypts bytes, not numbers), and it allows moving and centralizing
the concept of endianness, as well as centralizing where batches are created.
2025-06-04 09:57:18 +02:00
David Testé
312952007f chore(ci): lock zizmor version to avoid breaking ci pipelines
Newer version of Zizmor can trigger errors due to new findings in workflows. To avoid breaking any ongoing pull-request, due to this unhandled update, zizmor version is locked.
2025-06-03 12:29:36 +02:00
Enzo Di Maria
ff51ed3f34 refactor(gpu): moving trim_radix_blocks_lsb_async to backend 2025-06-03 11:42:18 +02:00
Agnes Leroy
9737bdcb98 fix(gpu): fix degrees after bitxor 2025-06-03 08:47:12 +02:00
tmontaigu
87a43a4900 chore(integer): add determinism check for sum 2025-06-02 17:37:21 +02:00
Agnes Leroy
345bdbf17f feat(gpu): add memory tracking function for cmux 2025-06-02 17:29:17 +02:00
Agnes Leroy
cc54ba2236 chore(gpu): fix overflow in div in long run tests 2025-06-02 17:05:09 +02:00
David Testé
11df6c69ee chore(ci): fix workflow security warnings
Since Zizmor v1.9.0, new pedantic warnings are detected especially
regarding template-injection patterns.
2025-06-02 14:46:14 +02:00
Guillermo Oyarzun
b76f4dbfe0 fix(gpu): fix hardcoded use of message modulus 2025-06-02 10:43:14 +02:00
Enzo Di Maria
be21c15c80 refactor(gpu): moving extend_radix_with_trivial_zero_blocks_msb to backend 2025-06-02 09:19:51 +02:00
tmontaigu
aa51b25313 chore(ci): fix test_user_docs run and add hpu
Due to #[cfg] before the test_user_docs module, the module would
not actually be compiled (and thus the user doc tests would not run)
unless all required features were activated when running.

So we remove these cfg, as each hardware doc supports its own set of
features, and it's better to have a test fail because a feature is
missing rather than silently running nothing

Also, add commands and ci stuff to check HPU docs
2025-05-30 16:36:56 +02:00
tmontaigu
300c95fe3d fix(doc): finish HPU example fix 2025-05-30 16:36:56 +02:00
pgardratzama
524adda8f6 fix(doc): hpu example was not compiling 2025-05-30 16:36:56 +02:00
tmontaigu
dedcf205b4 feat(integer): improve default neg 2025-05-30 15:02:35 +02:00
tmontaigu
2c8d4c0fb0 feat(hlapi): add overflowing_neg 2025-05-30 15:02:35 +02:00
tmontaigu
3370fb5b7e feat(gpu): add overflowing_neg 2025-05-30 15:02:35 +02:00
tmontaigu
cd77eac42b feat(integer): add overflowing_neg 2025-05-30 15:02:35 +02:00
Baptiste Roux
40f20b4ecb fix(hpu): Rewrite hpu_bench iteration loop
The hpu_bench example was wrong for iter > 1 following clippy modifications.
NB: The vector is collected, but intermediate values are explicitly dropped to enable long-running stress tests.
2025-05-28 14:45:45 +02:00
120 changed files with 3037 additions and 1231 deletions

View File

@@ -33,7 +33,9 @@ runs:
if: inputs.github-instance == 'true'
shell: bash
run: |
TOOLKIT_VERSION="$(echo ${CUDA_VERSION} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${env.CUDA_KEYRING_PACKAGE}
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum

View File

@@ -67,7 +67,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -126,9 +126,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -174,7 +174,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -182,6 +182,7 @@ jobs:
if: needs.should-run.outputs.csprng_test == 'true'
run: |
make test_tfhe_csprng
make test_tfhe_csprng_big_endian
- name: Run tfhe-zk-pok tests
if: needs.should-run.outputs.zk_pok_test == 'true'
@@ -272,9 +273,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}

View File

@@ -114,7 +114,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -142,9 +142,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -115,7 +115,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -147,9 +147,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -185,7 +185,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -254,9 +254,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -68,7 +68,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -123,9 +123,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -114,8 +117,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -116,8 +119,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -59,14 +59,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -111,8 +114,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -46,15 +46,18 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
echo "FAST_BENCH=TRUE";
} >> "${GITHUB_ENV}"
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -93,8 +96,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -124,14 +130,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -170,8 +179,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -120,26 +120,33 @@ jobs:
env:
INPUTS_PARAMS_TYPE: ${{ inputs.params_type }}
- name: Set command output
id: set_command
run: |
echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
echo "command=${COMMAND_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
COMMAND_OUTPUT: ${{ toJSON(env.COMMAND) }}
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
- name: Set benchmark types output
id: set_bench_type
run: |
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
- name: Set parameters types output
id: set_params_type
run: |
echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
echo "params_type=${PARAMS_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
PARAMS_TYPE_OUTPUT: ${{ toJSON(env.PARAMS_TYPE) }}
setup-instance:
name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
@@ -227,6 +234,8 @@ jobs:
include:
- cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -237,18 +246,20 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
# Re-export environment variables as dependencies setup perform this task in the previous job.
# Local env variables are cleaned at the end of each job.
- name: Export CUDA variables
shell: bash
run: |
CUDA_PATH=/usr/local/cuda-${{ matrix.cuda }}
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
@@ -258,13 +269,15 @@ jobs:
shell: bash
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -317,8 +330,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -119,14 +119,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -167,8 +170,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -120,14 +120,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -168,8 +171,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -37,14 +37,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -84,5 +87,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}

View File

@@ -79,12 +79,16 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
- name: Set benchmark types output
id: set_bench_type
run: |
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
setup-instance:
name: Setup instance (integer-benchmarks)
@@ -128,14 +132,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -193,8 +200,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -48,7 +48,9 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
setup-instance:
name: Setup instance (shortint-benchmarks)
@@ -89,14 +91,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -150,8 +155,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -79,12 +79,16 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
echo "op_flavor=${OP_FLAVOR_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
OP_FLAVOR_OUTPUT: ${{ toJSON(env.OP_FLAVOR) }}
- name: Set benchmark types output
id: set_bench_type
run: |
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
setup-instance:
name: Setup instance (signed-integer-benchmarks)
@@ -128,14 +132,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -185,8 +192,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -61,11 +61,14 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -61,11 +61,14 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -98,14 +98,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -155,8 +158,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -96,14 +96,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -136,12 +139,16 @@ jobs:
- name: Install web resources
run: |
make install_${{ matrix.browser }}_browser
make install_${{ matrix.browser }}_web_driver
make install_"${BROWSER}"_browser
make install_"${BROWSER}"_web_driver
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks
run: |
make bench_web_js_api_parallel_${{ matrix.browser }}_ci
make bench_web_js_api_parallel_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Parse results
run: |
@@ -188,8 +195,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -93,7 +93,9 @@ jobs:
- name: Set benchmark types output
id: set_bench_type
run: |
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
echo "bench_type=${BENCH_TYPE_OUTPUT}" >> "${GITHUB_OUTPUT}"
env:
BENCH_TYPE_OUTPUT: ${{ toJSON(env.BENCH_TYPE) }}
setup-instance:
name: Setup instance (pke-zk-benchmarks)
@@ -140,14 +142,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -205,8 +210,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -35,7 +35,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -25,10 +25,10 @@ jobs:
- name: Get actionlint
run: |
wget "https://github.com/rhysd/actionlint/releases/download/v${{ env.ACTIONLINT_VERSION }}/actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz"
echo "${{ env.ACTIONLINT_CHECKSUM }} actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz" > checksum
wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
sha256sum -c checksum
tar -xf actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz actionlint
tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
ln -s "$(pwd)/actionlint" /usr/local/bin/
- name: Lint workflows

View File

@@ -54,7 +54,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -66,7 +66,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -59,7 +59,7 @@ jobs:
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "Authorization: Bearer ${TOKEN}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${COMMENTS_URL}" \
-d "${BODY}"
@@ -71,6 +71,7 @@ jobs:
REPO: ${{ github.repository }}
EVENT_NUMBER: ${{ github.event.number }}
COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
@@ -81,7 +82,7 @@ jobs:
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PUT \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "Authorization: Bearer ${TOKEN}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_PR_URL}"/merge \
-d '{ "merge_method": "rebase" }'
@@ -91,6 +92,7 @@ jobs:
exit $RES
env:
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
@@ -101,7 +103,7 @@ jobs:
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PATCH \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "Authorization: Bearer ${TOKEN}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_PR_URL}" \
-d '{ "state": "closed" }'
@@ -111,6 +113,7 @@ jobs:
exit $RES
env:
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Delete the associated branch in the data repo
run: |
@@ -120,13 +123,15 @@ jobs:
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X DELETE \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "Authorization: Bearer ${TOKEN}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}

View File

@@ -45,7 +45,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -172,9 +172,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -124,7 +124,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -79,7 +79,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -161,9 +161,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -72,7 +72,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -81,16 +81,20 @@ jobs:
if: env.SECRETS_AVAILABLE == 'false'
shell: bash
run: |
TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
sudo apt update
sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
env:
CUDA_VERSION: ${{ matrix.cuda }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -100,17 +104,21 @@ jobs:
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc" >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Run fmt checks
run: |
@@ -127,9 +135,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -144,9 +144,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -158,9 +158,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -130,7 +130,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -144,9 +144,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -158,9 +158,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -130,7 +130,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -70,4 +70,4 @@ jobs:
source setup_hpu.sh
just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
make HPU_CONFIG=sim test_high_level_api_hpu
make HPU_CONFIG=sim test_user_doc_hpu

View File

@@ -57,7 +57,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -46,7 +46,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -67,7 +67,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -78,17 +78,19 @@ jobs:
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
} >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Prepare package
@@ -129,7 +131,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -140,19 +142,23 @@ jobs:
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
} >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Publish crate.io package
env:

View File

@@ -170,9 +170,13 @@ install_typos_checker: install_rs_build_toolchain
.PHONY: install_zizmor # Install zizmor workflow security checker
install_zizmor: install_rs_build_toolchain
@zizmor --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
( echo "Unable to install zizmor, unknown error." && exit 1 )
.PHONY: install_cargo_cross # Install custom tfhe-rs lints
install_cargo_cross: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:
python3 -m venv venv
@@ -935,9 +939,21 @@ test_user_doc: install_rs_build_toolchain
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
--features=internal-keycache,integer,zk-pok,gpu -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_hpu # Run tests for HPU from the .md documentation
test_user_doc_hpu: install_rs_build_toolchain
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=internal-keycache,integer,hpu,hpu-v80 -p $(TFHE_SPEC) \
-- test_user_docs::
else
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=internal-keycache,integer,hpu -p $(TFHE_SPEC) \
-- test_user_docs::
endif
.PHONY: test_regex_engine # Run tests for regex_engine example
@@ -968,6 +984,12 @@ test_tfhe_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-csprng
.PHONY: test_tfhe_csprng_big_endian # Run tfhe-csprng tests on an emulated big endian system
test_tfhe_csprng_big_endian: install_rs_build_toolchain install_cargo_cross
RUSTFLAGS="" cross $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \

View File

@@ -400,7 +400,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
@@ -538,5 +539,13 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
void *const *streams, uint32_t const *gpu_indexes);
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
void *const *streams,
uint32_t const *gpu_indexes);
} // extern C
#endif // CUDA_INTEGER_H

View File

@@ -1116,18 +1116,116 @@ template <typename Torus> struct int_overflowing_sub_memory {
};
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;
int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;
bool mem_reuse = false;
size_t max_total_blocks_in_vec;
uint32_t num_blocks_in_radix;
uint32_t max_num_radix_in_vec;
uint64_t *size_tracker;
bool gpu_memory_allocated;
// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;
uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;
uint64_t *d_degrees;
uint32_t *d_pbs_counters;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
bool mem_reuse = false;
void setup_index_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, gpu_memory_allocated);
d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
gpu_memory_allocated);
auto num_blocks_in_radix = this->num_blocks_in_radix;
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
auto setup_columns =
[num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t *size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], gpu_memory_allocated);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memcpy_async_to_gpu(columns, h_columns,
num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
gpu_memory_allocated);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
size_tracker, gpu_memory_allocated);
}
void setup_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
uint32_t message_modulus = params.message_modulus;
if (!mem_reuse) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
gpu_memory_allocated, size_tracker);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
@@ -1136,103 +1234,84 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool allocate_gpu_memory,
uint64_t *size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
setup_index_buffers(streams, gpu_indexes);
setup_lookup_tables(streams, gpu_indexes, gpu_count);
// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
// create and allocate intermediate buffers
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}
int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
uint64_t *size_tracker) {
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
gpu_indexes[0], gpu_memory_allocated);
if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
luts_message_carry->release(streams, gpu_indexes, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks;
delete old_blocks;
delete current_blocks;
delete small_lwe_vector;
delete luts_message_carry;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {
@@ -2549,7 +2628,7 @@ template <typename Torus> struct int_mul_memory {
// radix_lwe_left except the last blocks of each shift
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
int total_block_count = num_radix_blocks * num_radix_blocks;
// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
@@ -2562,13 +2641,13 @@ template <typename Torus> struct int_mul_memory {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 2, total_block_count,
params, 2, 2 * total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
@@ -2602,9 +2681,10 @@ template <typename Torus> struct int_mul_memory {
luts_array->broadcast_lut(streams, gpu_indexes, 0);
// create memory object for sum ciphertexts
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -3918,7 +3998,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[0]->get_degree(0),
zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 2, gpu_memory_allocated);
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
0);
generate_device_accumulator_bivariate_with_factor<Torus>(
@@ -3927,7 +4008,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[1]->get_degree(0),
zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 3, gpu_memory_allocated);
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
0);
@@ -3954,7 +4036,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[0]->get_degree(0),
zero_out_if_overflow_happened[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, 2, gpu_memory_allocated);
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes, 0);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
@@ -3962,7 +4045,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[1]->get_degree(0),
zero_out_if_overflow_happened[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, 3, gpu_memory_allocated);
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes, 0);
// merge_overflow_flags_luts

View File

@@ -63,7 +63,7 @@ void update_degrees_after_bitor(uint64_t *output_degrees,
auto result = max;
for (uint j = 0; j < min + 1; j++) {
if (max | j > result) {
if ((max | j) > result) {
result = max | j;
}
}
@@ -82,7 +82,7 @@ void update_degrees_after_bitxor(uint64_t *output_degrees,
// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if (max ^ j > result) {
if ((max ^ j) > result) {
result = max ^ j;
}
}

View File

@@ -36,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
update_degrees_after_bitor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);
} else if (mem_ptr->op == BITXOR) {
} else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);

View File

@@ -0,0 +1,17 @@
#include "cast.cuh"
void extend_radix_with_trivial_zero_blocks_msb_64(
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    void *const *streams, uint32_t const *gpu_indexes) {
  // Thin C-ABI wrapper: forwards to the templated host implementation,
  // instantiated for 64-bit torus elements.
  auto typed_streams = (cudaStream_t *)streams;
  host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(output, input,
                                                           typed_streams,
                                                           gpu_indexes);
}
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
                              CudaRadixCiphertextFFI const *input,
                              void *const *streams,
                              uint32_t const *gpu_indexes) {
  // Thin C-ABI wrapper: forwards to the templated host implementation,
  // instantiated for 64-bit torus elements.
  auto typed_streams = (cudaStream_t *)streams;
  host_trim_radix_blocks_lsb<uint64_t>(output, input, typed_streams,
                                       gpu_indexes);
}

View File

@@ -0,0 +1,36 @@
#ifndef CAST_CUH
#define CAST_CUH
#include "device.h"
#include "integer.cuh"
#include "integer/integer_utilities.h"
template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    cudaStream_t const *streams, uint32_t const *gpu_indexes) {
  // Copy all input blocks into the low (LSB) positions of the output; the
  // output's extra MSB blocks are presumably already trivial zeros (the
  // function only performs this copy) — confirm against the caller.
  const uint32_t num_input_blocks = input->num_radix_blocks;
  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                           0, num_input_blocks, input, 0,
                                           num_input_blocks);
}
template <typename Torus>
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
                                         CudaRadixCiphertextFFI const *input,
                                         cudaStream_t const *streams,
                                         uint32_t const *gpu_indexes) {
  // Validate the precondition BEFORE computing the offset: if the input does
  // not have strictly more blocks than the output, the unsigned subtraction
  // below would wrap around. (Original code computed it first.)
  if (input->num_radix_blocks <= output->num_radix_blocks) {
    PANIC("Cuda error: input num blocks should be greater than output num "
          "blocks");
  }

  // Drop the least-significant blocks: keep only the top
  // output->num_radix_blocks blocks of the input.
  const uint32_t input_start_lwe_index =
      input->num_radix_blocks - output->num_radix_blocks;

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
      input_start_lwe_index, input->num_radix_blocks);
}
#endif

View File

@@ -456,7 +456,7 @@ __host__ void tree_sign_reduction(
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
4);
message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -477,16 +477,17 @@ __host__ void tree_sign_reduction(
auto last_lut = tree_buffer->tree_last_leaf_lut;
auto block_selector_f = tree_buffer->block_selector_f;
std::function<Torus(Torus)> f;
auto num_bits_in_message = log2_int(params.message_modulus);
if (partial_block_count == 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
4);
message_modulus);
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
int lsb = x & 3;
f = [block_selector_f, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
Torus lsb = x & (message_modulus - 1);
int final_sign = block_selector_f(msb, lsb);
Torus final_sign = block_selector_f(msb, lsb);
return sign_handler_f(final_sign);
};
} else {

View File

@@ -386,8 +386,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1);
int factor = (i) ? 3 : 2;
int factor_lut_id = factor - 2;
auto message_modulus = radix_params.message_modulus;
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
int factor_lut_id = (i) ? 1 : 0;
for (size_t k = 0;
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],

View File

@@ -520,8 +520,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_in->num_radix_blocks)
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")
@@ -1616,10 +1615,12 @@ __host__ void reduce_signs(
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto num_bits_in_message = log2_int(message_modulus);
std::function<Torus(Torus)> reduce_two_orderings_function =
[diff_buffer, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
int lsb = x & 3;
[diff_buffer, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
Torus lsb = x & (message_modulus - 1);
return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
};
@@ -1640,7 +1641,7 @@ __host__ void reduce_signs(
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, 4);
num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
ms_noise_reduction_key, lut, num_sign_blocks / 2);
@@ -1669,7 +1670,8 @@ __host__ void reduce_signs(
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes, 0);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a, 2, 4);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
ms_noise_reduction_key, lut, 1);
@@ -1677,8 +1679,8 @@ __host__ void reduce_signs(
} else {
std::function<Torus(Torus)> final_lut_f =
[mem_ptr, sign_handler_f](Torus x) -> Torus {
return sign_handler_f(x & 3);
[mem_ptr, sign_handler_f, message_modulus](Torus x) -> Torus {
return sign_handler_f(x & (message_modulus - 1));
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;

View File

@@ -226,7 +226,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -234,64 +235,59 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

View File

@@ -20,28 +20,11 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus>
template <typename Torus>
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
                           int32_t *id_in, size_t lwe_size) {
  // Gather/scatter of LWE ciphertexts: CUDA block b copies the ciphertext at
  // slot id_in[b] of `src` into slot id_out[b] of `dst`, each thread striding
  // over the coefficients. A negative input id writes zeros instead (trivial
  // ciphertext).
  const size_t tid = threadIdx.x;
  const size_t b_id = blockIdx.x;
  const size_t stride = blockDim.x;

  const auto input_id = id_in[b_id];
  const auto output_id = id_out[b_id];

  const auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
  const auto cur_dst = &dst[output_id * lwe_size];

  // Fix: index with size_t — the original used `int i` compared against the
  // size_t bound `lwe_size` (signed/unsigned mismatch; would overflow for
  // lwe_size > INT_MAX).
  for (size_t i = tid; i < lwe_size; i += stride) {
    cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
  }
}
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -94,33 +77,173 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
}
}
template <typename Torus>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint64_t *const degrees,
const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec) {
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
size_t radix_elem_size = num_blocks * block_size;
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
size_t block_stride = blockIdx.y * block_size;
auto result = &dst_radix[block_stride];
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
result[i] = src_chunk[block_stride + i];
}
// accumulate rest of the radixes
for (int r_id = 1; r_id < chunk_size; r_id++) {
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
for (int i = tid; i < block_size; i += stride) {
result[i] += cur_src_radix[block_stride + i];
const uint32_t idx = threadIdx.x;
size_t cnt = 0;
for (int i = 0; i < num_radix_in_vec; i++) {
size_t ct_id = i * num_radix_blocks + idx;
if (degrees[ct_id] != 0) {
columns[idx][cnt] = ct_id;
++cnt;
}
}
columns_counter[idx] = cnt;
}
// Builds the next generation of columns and the in/out/LUT index triples that
// drive the following PBS batch. One thread handles one column; the kernel
// appears to assume a single thread block (it relies on __shared__ counters
// and __syncthreads only — confirm at launch site).
//
// For every complete chunk of `chunk_size` blocks in a column, two PBS are
// scheduled on the chunk's (pre-summed) first block: a "message" PBS (LUT 0)
// written back in place, and — recorded by the NEXT column's thread — a
// "carry" PBS (LUT 1) whose output lands in the chunk's second slot of that
// next column. Leftover blocks (< chunk_size) are carried over untouched.
//
// Results: pbs_counters[0] = total PBS count, pbs_counters[1] = message PBS
// count, pbs_counters[2] = non-zero if some new column still exceeds
// chunk_size (another reduction round is needed).
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, uint32_t *const pbs_counters,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size) {
// Shared across the block: `counter` hands out PBS slots, `sharedOr` flags
// whether any column still needs another round.
__shared__ uint32_t counter, sharedOr;
if (threadIdx.x == 0) {
counter = 0;
sharedOr = 0;
}
__syncthreads();
const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];
uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
// atomicAdd returns the previous value: a unique PBS slot per entry.
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
// ct1 ct2 ct3
// pbs_indexes: 0, 1, 2
// pbs_indexes: 2, 1, 0
// Barrier so `counter` now equals the number of message PBS before any
// carry PBS takes a slot.
__syncthreads();
uint32_t message_count = counter;
if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
// Blocks that did not fill a complete chunk are kept as-is in the new
// column.
const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}
new_columns_counter[base_id] = ct_count;
if (ct_count > chunk_size) {
atomicOr(&sharedOr, 1);
}
__syncthreads();
if (threadIdx.x == 0) {
pbs_counters[0] = counter;
pbs_counters[1] = message_count;
pbs_counters[2] = sharedOr;
}
}
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
    Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
    Torus *const lut_indexes, const uint32_t num_radix_blocks) {
  // One thread per PBS. Thread t reads input block (t % num_radix_blocks)
  // with LUT (t / num_radix_blocks); the output slot is shifted up by the
  // LUT id, so the second LUT's results land one block higher than the
  // first LUT's.
  const int tid = threadIdx.x;
  const int block_id = tid % num_radix_blocks;
  const int lut_id = tid / num_radix_blocks;
  pbs_indexes_in[tid] = block_id;
  pbs_indexes_out[tid] = tid + lut_id;
  lut_indexes[tid] = lut_id;
}
// Reduces each column chunk-wise, in place: for every complete group of
// `chunk_size` ciphertexts in a column, sums their coefficients and stores
// the result into the chunk's FIRST ciphertext. Grid layout: blockIdx.x is
// the column, blockIdx.y selects which slice of the LWE coefficients this
// thread block covers; columns with fewer than chunk_size entries are left
// untouched.
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
// Which LWE coefficient this thread owns; guard against the partial last
// slice.
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
// Accumulate the chunk into `result`, starting from its first member...
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}
// ...then overwrite the first member with the chunk's sum.
input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
    Torus *const out_radix, const Torus *const input_blocks,
    const uint32_t *const *const columns, const uint32_t *const columns_counter,
    const uint32_t chunk_size, const uint32_t block_size) {
  // Final reduction: sum every remaining ciphertext of a column into the
  // corresponding block of the output radix. blockIdx.x is the column,
  // blockIdx.y the coefficient slice; an empty column writes zeros.
  const uint32_t column_id = blockIdx.x;
  const uint32_t coef_id = blockIdx.y * blockDim.x + threadIdx.x;
  if (coef_id >= block_size)
    return;

  const uint32_t column_len = columns_counter[column_id];
  Torus acc = 0;
  for (uint32_t i = 0; i < column_len; ++i) {
    const uint32_t ct_id = columns[column_id][i];
    acc += input_blocks[ct_id * block_size + coef_id];
  }
  out_radix[column_id * block_size + coef_id] = acc;
}
template <typename Torus, class params>
@@ -167,6 +290,65 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
// Returns true iff at least one column — i.e. one block position across the
// vector of radix ciphertexts — holds strictly more than `chunk_size`
// non-trivial blocks (degree != 0), in which case another reduction round is
// required.
//
// `degrees` is laid out radix-major: the degree of block `column` of radix
// `block` lives at degrees[block * num_radix_blocks + column].
//
// Improvement over the original: only the CURRENT column's count is ever
// inspected, so the O(num_radix_blocks) std::vector of counters is replaced
// by a single scalar (no heap allocation).
inline bool at_least_one_column_needs_processing(
    const uint64_t *const degrees, const uint32_t num_radix_blocks,
    const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
  for (size_t column = 0; column < num_radix_blocks; ++column) {
    uint32_t non_zero_count = 0;
    for (size_t block = 0; block < num_radix_in_vec; ++block) {
      const size_t block_index = block * num_radix_blocks + column;
      if (degrees[block_index] != 0) {
        ++non_zero_count;
        if (non_zero_count > chunk_size) {
          return true;
        }
      }
    }
  }
  return false;
}
// Host-side simulation of the column-wise chunk reduction's degree evolution.
//
// Non-trivial input degrees are distributed into per-column queues
// (radix-major layout: block j of radix i is at input_degrees[i*num_blocks+j]).
// While a column holds more than one entry, up to `chunk_size` entries are
// summed; the resulting degree (clamped to message_modulus - 1) is pushed
// back into this column and also into the next column when one exists —
// presumably modelling the carry produced by the chunk sum. On exit,
// out_degrees[i] is the surviving degree of column i (0 for empty columns).
inline void calculate_final_degrees(uint64_t *const out_degrees,
                                    const uint64_t *const input_degrees,
                                    size_t num_blocks, size_t num_radix_in_vec,
                                    size_t chunk_size,
                                    uint64_t message_modulus) {
  auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
    return std::min(message_modulus - 1, degree);
  };

  std::vector<std::queue<uint64_t>> columns(num_blocks);
  for (size_t i = 0; i < num_radix_in_vec; ++i) {
    for (size_t j = 0; j < num_blocks; ++j) {
      if (input_degrees[i * num_blocks + j])
        columns[j].push(input_degrees[i * num_blocks + j]);
    }
  }

  for (size_t i = 0; i < num_blocks; ++i) {
    auto &col = columns[i];
    while (col.size() > 1) {
      // Fix: accumulate in 64 bits — the original used a uint32_t, which
      // could truncate sums of the 64-bit degrees for large chunks.
      uint64_t cur_degree = 0;
      const size_t mn = std::min(chunk_size, col.size());
      for (size_t j = 0; j < mn; ++j) {
        cur_degree += col.front();
        col.pop();
      }
      const uint64_t new_degree = get_degree(cur_degree);
      col.push(new_degree);
      if ((i + 1) < num_blocks) {
        columns[i + 1].push(new_degree);
      }
    }
  }

  for (size_t i = 0; i < num_blocks; i++) {
    out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
  }
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -181,15 +363,107 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
return size_tracker;
}
// DEBUG-ONLY kernel: dumps each column's counter and ciphertext indexes to
// stdout from the device. Device-side printf serializes execution — do not
// ship in release builds (per the commit log, these prints are slated for
// removal).
__global__ inline void DEBUG_PRINT_COLUMNS(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint32_t num_radix_blocks) {
// Number of live blocks currently held in each column.
printf("cuda_columns_counter:\n");
for (int i = 0; i < num_radix_blocks; i++) {
printf("%d ", columns_counter[i]);
}
printf("\n");
// For each column, the flat ciphertext indexes it references.
printf("cuda_columns:\n");
for (int i = 0; i < num_radix_blocks; i++) {
printf("column[%d]: ", i);
for (int j = 0; j < columns_counter[i]; j++)
{
printf("%d ", columns[i][j]);
}
printf("\n");
}
printf("\n");
}
// DEBUG-ONLY kernel: for every column entry, prints the LAST coefficient
// (the LWE body) of the referenced ciphertext divided by `delta`.
// 576460752303423488 == 2^59 — presumably the plaintext scaling factor for
// the parameter set under debug; confirm against the parameters in use.
// The scattered __syncthreads() calls look like an attempt to order the
// printf output and have no other effect here. Remove before release.
__global__ inline void DEBUG_PRINT_COLUMNS_DATA(uint32_t *const *const columns,
uint32_t *const columns_counter,
uint64_t* data,
const uint32_t num_radix_blocks, size_t lwe_size) {
uint64_t delta = 576460752303423488ULL;
__syncthreads();
printf("cuda_new_columns:\n");
__syncthreads();
for (int i = 0; i < num_radix_blocks; i++) {
__syncthreads();
printf("column[%d]: ", i);
__syncthreads();
for (int j = 0; j < columns_counter[i]; j++)
{
__syncthreads();
auto cur_data =data[ columns[i][j] * lwe_size + lwe_size - 1];
cur_data /= delta;
printf("%llu ", cur_data);
__syncthreads();
}
__syncthreads();
printf("\n");
__syncthreads();
}
__syncthreads();
printf("\n");
__syncthreads();
}
// DEBUG-ONLY kernel: prints, for each of the `num` scheduled PBS, its
// input/output/LUT indexes plus the last coefficient (LWE body) of either
// the input (`input` == true) or the output ciphertext. When `clear` is
// true the value is additionally divided by 2^59 — presumably the plaintext
// scaling factor; confirm against the parameter set. Remove before release.
template<typename Torus, bool input, bool clear>
__global__ inline void DEBUG_PRINT_PBS_DATA(Torus * data, Torus* input_indexes, Torus*
output_indexes, Torus *lut_indexes, size_t lwe_size, int num) {
printf("input_output_indexes: \n");
for (int i = 0; i < num; i++) {
auto input_val = data[input_indexes[i] * lwe_size + lwe_size -1];
auto output_val = data[output_indexes[i] * lwe_size + lwe_size -1];
auto val = input ? input_val : output_val;
auto val_clear = clear ? val / 576460752303423488ULL : val;
printf("%d %lu %lu %lu %lu %lu\n", i, input_indexes[i], output_indexes[i], lut_indexes[i],
val_clear, val);
}
}
//template<typename Torus>
//__global__ inline void DEBUG_PRINT_RADIX(Torus * data, size_t num_blocks, size_t lwe_size) {
// for (int i = 0; i < num_blocks; i++) {
// auto val = data[i * lwe_size + lwe_size - 1];
// auto val_clear = val / 576460752303423488ULL;
// printf("cuda_partial_sum_result: %lu %lu\n", val, val_clear);
// }
//}
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaRadixCiphertextFFI *terms, bool reduce_degrees_for_single_carry_propagation, void *const
*bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut) {
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
// cudaDeviceSynchronize();
// print_body<Torus>("cuda_input_partial_sum", (Torus*)terms->ptr, num_radix_blocks * num_radix_in_vec,
// 2048,
// 576460752303423488ULL);
// for (int i = 0; i <num_radix_blocks * num_radix_in_vec; i++ ) {
// printf("cuda_input_degrees: %d\n", terms->degrees[i]);
// }
// cudaDeviceSynchronize();
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -199,22 +473,28 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto current_blocks = mem_ptr->current_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_pbs_counters = mem_ptr->d_pbs_counters;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto luts_message_carry = mem_ptr->luts_message_carry;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto chunk_size =
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
(mem_ptr->params.message_modulus - 1);
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
// In the case of extracting a single LWE this parameters are dummy
uint32_t num_many_lut = 1;
@@ -228,244 +508,202 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (old_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
terms);
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
return;
}
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;
size_t h_lwe_idx_in[terms->num_radix_blocks];
size_t h_lwe_idx_out[terms->num_radix_blocks];
int32_t h_smart_copy_in[terms->num_radix_blocks];
int32_t h_smart_copy_out[terms->num_radix_blocks];
/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, true, nullptr);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_message, true);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry, true);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
while (r > 2) {
size_t cur_total_blocks = r * num_radix_blocks;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_radix_blocks, 1);
bool needs_processing = at_least_one_column_needs_processing(
current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);
cuda_set_device(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
int number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
check_cuda_error(cudaGetLastError());
// h_pbs_counters[0] - total ciphertexts
// h_pbs_counters[1] - message ciphertexts
// h_pbs_counters[2] - at_least_one_column_needs_processing
uint32_t *h_pbs_counters;
cudaMallocHost((void **)&h_pbs_counters, 3 * sizeof(uint32_t));
if (mem_ptr->mem_reuse) {
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
}
size_t total_count = 0;
size_t message_count = 0;
size_t carry_count = 0;
size_t sm_copy_count = 0;
generate_ids_update_degrees(
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
message_max, total_count, message_count, carry_count, sm_copy_count);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);
while (needs_processing) {
// cudaDeviceSynchronize();
// DEBUG_PRINT_COLUMNS<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, num_radix_blocks);
// DEBUG_PRINT_COLUMNS_DATA<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, (uint64_t *)
// (current_blocks->ptr), num_radix_blocks, big_lwe_size);
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
d_pbs_counters, d_columns, d_columns_counter, chunk_size);
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
new_blocks, 0, r * num_radix_blocks);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
check_cuda_error(cudaGetLastError());
cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
3 * sizeof(uint32_t), streams[0], gpu_indexes[0]);
if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(0, message_count), 1,
carry_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
const uint32_t total_ciphertexts = h_pbs_counters[0];
const uint32_t total_messages = h_pbs_counters[1];
needs_processing = (h_pbs_counters[2] != 0);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
// DEBUG_PRINT_PBS_DATA<Torus, true, true><<<1, 1, 0, streams[0]>>>(
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
// );
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
luts_message_carry->using_trivial_lwe_indexes = false;
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
// DEBUG_PRINT_PBS_DATA<Torus, false, true><<<1, 1, 0, streams[0]>>>(
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
// );
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
cudaFreeHost(h_pbs_counters);
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
if (reduce_degrees_for_single_carry_propagation) {
prepare_final_pbs_indexes<Torus><<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
2 * num_radix_blocks);
}
// cudaDeviceSynchronize();
// print_body<Torus>("cuda_before_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
// 576460752303423488ULL);
// cudaDeviceSynchronize();
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice, num_radix_blocks);
// printf("add_happened\n");
}
// cudaDeviceSynchronize();
//
// print_body<Torus>("cuda_out_after_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
// 576460752303423488ULL);
// cudaDeviceSynchronize();
}
template <typename Torus, class params>
@@ -599,14 +837,28 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
for (int i = 0; i < num_blocks * 2 * num_blocks; i++)
{
auto cur_ptr = (Torus*)vector_result_sb->ptr;
cur_ptr += i * 2049 + 2048;
print_debug<Torus>("", cur_ptr, 1);
}
for (int i = 0; i < num_blocks * 2 * num_blocks; i++) {
printf("%d\n", vector_result_sb->degrees[i]);
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
2 * num_blocks);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,

View File

@@ -115,13 +115,10 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j, nullptr);
num_radix_blocks, j);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;

View File

@@ -398,20 +398,32 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
// On H100s we should be using TBC until num_samples < num_sms / 2.
// After that we switch to CG until not supported anymore.
// At this point we return to TBC.
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, cuda_get_max_shared_memory(gpu_index)))
level_count, cuda_get_max_shared_memory(gpu_index));
if (supports_tbc &&
!(input_lwe_ciphertext_count > num_sms / 2 && supports_cg))
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index)))
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,

View File

@@ -37,18 +37,21 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
printf("\n");
}
template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
}
}
template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension) {
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
cudaDeviceSynchronize();
printf("\n");
}

View File

@@ -1018,6 +1018,7 @@ unsafe extern "C" {
gpu_count: u32,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
reduce_degrees_for_single_carry_propagation: bool,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
@@ -1316,6 +1317,22 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
    /// FFI binding into the CUDA backend.
    ///
    /// Presumably copies `input` into `output` while dropping some number of
    /// least-significant radix blocks (the count is likely carried inside the
    /// `CudaRadixCiphertextFFI` metadata) — behavior is defined by the C/CUDA
    /// implementation, not visible here; confirm against the backend source.
    ///
    /// # Safety
    /// All pointers must be valid, and `streams`/`gpu_indexes` must describe
    /// live CUDA streams/devices as required by the backend.
    pub fn trim_radix_blocks_lsb_64(
        output: *mut CudaRadixCiphertextFFI,
        input: *const CudaRadixCiphertextFFI,
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
    );
}
unsafe extern "C" {
    /// FFI binding into the CUDA backend.
    ///
    /// Presumably copies `input` into `output` and appends trivial
    /// (noiseless) zero blocks on the most-significant end — the exact
    /// contract lives in the C/CUDA implementation; confirm there.
    ///
    /// # Safety
    /// All pointers must be valid, and `streams`/`gpu_indexes` must describe
    /// live CUDA streams/devices as required by the backend.
    pub fn extend_radix_with_trivial_zero_blocks_msb_64(
        output: *mut CudaRadixCiphertextFFI,
        input: *const CudaRadixCiphertextFFI,
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
    );
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;

View File

@@ -13,7 +13,7 @@ rust-version = "1.72"
[dependencies]
aes = "0.8.2"
rayon = { workspace = true , optional = true }
rayon = { workspace = true, optional = true }
[target.'cfg(target_os = "macos")'.dependencies]
libc = "0.2.133"

View File

@@ -1,5 +1,4 @@
use crate::generators::aes_ctr::index::AesIndex;
use crate::generators::aes_ctr::BYTES_PER_BATCH;
use crate::generators::aes_ctr::{AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_PER_BATCH};
/// Represents a key used in the AES block cipher.
#[derive(Clone, Copy)]
@@ -16,5 +15,7 @@ pub trait AesBlockCipher: Clone + Send + Sync {
/// Instantiate a new generator from a secret key.
fn new(key: AesKey) -> Self;
/// Generates the batch corresponding to the given index.
fn generate_batch(&mut self, index: AesIndex) -> [u8; BYTES_PER_BATCH];
fn generate_batch(&mut self, data: [u128; AES_CALLS_PER_BATCH]) -> [u8; BYTES_PER_BATCH];
/// Generate next bytes
fn generate_next(&mut self, data: u128) -> [u8; BYTES_PER_AES_CALL];
}

View File

@@ -3,6 +3,7 @@ use crate::generators::aes_ctr::index::TableIndex;
use crate::generators::aes_ctr::states::{BufferPointer, ShiftAction, State};
use crate::generators::aes_ctr::BYTES_PER_BATCH;
use crate::generators::{ByteCount, BytesPerChild, ChildrenCount, ForkError};
use crate::seeders::SeedKind;
// Usually, to work with iterators and parallel iterators, we would use opaque types such as
// `impl Iterator<..>`. Unfortunately, it is not yet possible to return existential types in
@@ -82,6 +83,26 @@ impl<BlockCipher: AesBlockCipher> AesCtrGenerator<BlockCipher> {
}
}
/// Builds a generator from either kind of seed.
///
/// - `Ctr` seeds keep the historical behavior: the 128-bit seed is used
///   directly as the AES key and the generator starts from the default state.
/// - `Xof` seeds are first absorbed by [`super::xof_init`], which derives
///   both the AES key and a counter offset; the state then starts at
///   `TableIndex::SECOND` with that offset applied to every AES input.
pub(crate) fn from_seed(seed: impl Into<SeedKind>) -> Self {
    match seed.into() {
        SeedKind::Ctr(seed) => Self::new(AesKey(seed.0), None, None),
        SeedKind::Xof(seed) => {
            // Derive (key, counter offset) from the XOF seed material.
            let (key, init_index) = super::xof_init(seed);
            // NOTE(review): this upper bound appears to mirror the default
            // used by `Self::new` — confirm against that constructor.
            let last_index = TableIndex::LAST.decremented();
            let state = State::with_offset(TableIndex::SECOND, init_index);
            let block_cipher = Box::new(BlockCipher::new(key));
            // Buffer starts zeroed; the state machine forces a batch refresh
            // before the first byte is ever read out of it.
            let buffer = [0u8; BYTES_PER_BATCH];
            Self {
                block_cipher,
                state,
                last: last_index,
                buffer,
            }
        }
    }
}
/// Returns the table index related to the previous random byte.
pub fn table_index(&self) -> TableIndex {
self.state.table_index()
@@ -189,7 +210,9 @@ impl<BlockCipher: AesBlockCipher> Iterator for AesCtrGenerator<BlockCipher> {
match self.state.increment() {
ShiftAction::OutputByte(BufferPointer(ptr)) => Some(self.buffer[ptr]),
ShiftAction::RefreshBatchAndOutputByte(aes_index, BufferPointer(ptr)) => {
self.buffer = self.block_cipher.generate_batch(aes_index);
let aes_inputs =
core::array::from_fn(|i| aes_index.0.wrapping_add(i as u128).to_le());
self.buffer = self.block_cipher.generate_batch(aes_inputs);
Some(self.buffer[ptr])
}
}

View File

@@ -56,7 +56,7 @@ impl TableIndex {
let new_byte_index = self.byte_index.0 + shift_remainder;
let full_aes_shifts = full_aes_shifts + new_byte_index / BYTES_PER_AES_CALL;
// Store the remainder in the byte index
self.byte_index.0 = new_byte_index % BYTES_PER_AES_CALL;
self.aes_index.0 = self.aes_index.0.wrapping_add(full_aes_shifts as u128);

View File

@@ -202,6 +202,7 @@ pub const BYTES_PER_BATCH: usize = BYTES_PER_AES_CALL * AES_CALLS_PER_BATCH;
/// A module containing structures to manage table indices.
mod index;
pub use index::*;
/// A module containing structures to manage table indices and buffer pointers together properly.
@@ -218,5 +219,28 @@ pub use generic::*;
/// A module extending `generic` to the `rayon` paradigm.
#[cfg(feature = "parallel")]
mod parallel;
use crate::seeders::XofSeed;
#[cfg(feature = "parallel")]
pub use parallel::*;
/// Absorbs a XOF seed into an AES-based chain and derives the pair
/// `(key, counter offset)` used to initialize an AES-CTR generator.
///
/// The seed's 128-bit blocks, followed by the seed bit-length (little-endian)
/// as a final block, are fed through AES under an all-zero key, each input
/// XOR-ed with the previous chaining value. The final chaining value becomes
/// the AES key; the one before it becomes the starting counter offset.
pub(crate) fn xof_init(seed: XofSeed) -> (AesKey, AesIndex) {
    let mut cipher = crate::generators::default::DefaultBlockCipher::new(AesKey(0));
    // Append the bit length as the last absorbed block (length padding).
    let length_block = std::iter::once(seed.bit_len().to_le());
    let (mut previous, mut current) = (0u128, 0u128);
    for block in seed.iter_u128_blocks().chain(length_block) {
        previous = current;
        current = u128::from_ne_bytes(cipher.generate_next(previous ^ block));
    }
    // Last chaining value -> key; penultimate -> counter offset (little-endian).
    (AesKey(current), AesIndex(previous.to_le()))
}

View File

@@ -10,6 +10,7 @@ pub struct BufferPointer(pub usize);
pub struct State {
table_index: TableIndex,
buffer_pointer: BufferPointer,
offset: AesIndex,
}
/// A structure representing the action to be taken by the generator after shifting its state.
@@ -25,24 +26,42 @@ pub enum ShiftAction {
impl State {
/// Creates a new state from the initial table index.
///
/// Note:
/// ------
///
/// The `table_index` input is the __first__ table index that will be outputted on the next
/// call to `increment`. Put differently, the current table index of the newly created state
/// is the predecessor of this one.
pub fn new(table_index: TableIndex) -> Self {
    // Plain construction: no offset is applied to the AES counter inputs.
    Self::with_offset(table_index, AesIndex(0))
}
/// Creates a new state from the initial table index and offset
///
/// The `offset` AesIndex will be applied to all AES encryption.
/// AES(Key, counter + offset).
/// This is to be used when one wants to start the AES
/// counter at a specific value but still output all the (2^128-1) values
///
/// Note:
/// ------
///
/// The `table_index` input is the __first__ table index that will be outputted on the next
/// call to `increment`. Put differently, the current table index of the newly created state
/// is the predecessor of this one.
pub fn with_offset(table_index: TableIndex, offset: AesIndex) -> Self {
// We ensure that the table index is not the first one, to prevent wrapping on `decrement`,
// and outputting `RefreshBatchAndOutputByte(AesIndex::MAX, ...)` on the first increment
// (which would lead to loading a non-continuous batch).
assert_ne!(table_index, TableIndex::FIRST);
State {
Self {
// To ensure that the first outputted table index is the proper one, we decrement the
// table index.
table_index: table_index.decremented(),
// To ensure that the first `ShiftAction` will be a `RefreshBatchAndOutputByte`, we set
// the buffer to the last allowed value.
buffer_pointer: BufferPointer(BYTES_PER_BATCH - 1),
offset,
}
}
@@ -52,7 +71,8 @@ impl State {
let total_batch_index = self.buffer_pointer.0 + shift;
if total_batch_index > BYTES_PER_BATCH - 1 {
self.buffer_pointer.0 = self.table_index.byte_index.0;
ShiftAction::RefreshBatchAndOutputByte(self.table_index.aes_index, self.buffer_pointer)
let index = AesIndex(self.table_index.aes_index.0.wrapping_add(self.offset.0));
ShiftAction::RefreshBatchAndOutputByte(index, self.buffer_pointer)
} else {
self.buffer_pointer.0 = total_batch_index;
ShiftAction::OutputByte(self.buffer_pointer)

View File

@@ -7,3 +7,13 @@ pub type DefaultRandomGenerator = super::NeonAesRandomGenerator;
not(any(target_arch = "x86_64", target_arch = "aarch64"))
))]
pub type DefaultRandomGenerator = super::SoftwareRandomGenerator;
// Selects the block-cipher implementation matching `DefaultRandomGenerator`:
// hardware AES on x86_64/aarch64 unless the software PRNG is forced, with a
// portable software fallback everywhere else.
#[cfg(all(target_arch = "x86_64", not(feature = "software-prng")))]
pub type DefaultBlockCipher = super::implem::AesniBlockCipher;
#[cfg(all(target_arch = "aarch64", not(feature = "software-prng")))]
pub type DefaultBlockCipher = super::implem::ArmAesBlockCipher;
#[cfg(any(
    feature = "software-prng",
    not(any(target_arch = "x86_64", target_arch = "aarch64"))
))]
pub type DefaultBlockCipher = super::SoftwareBlockCipher;

View File

@@ -1,4 +1,6 @@
use crate::generators::aes_ctr::{AesBlockCipher, AesIndex, AesKey, BYTES_PER_BATCH};
use crate::generators::aes_ctr::{
AesBlockCipher, AesKey, AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_PER_BATCH,
};
use core::arch::aarch64::{
uint8x16_t, vaeseq_u8, vaesmcq_u8, vdupq_n_u32, vdupq_n_u8, veorq_u8, vgetq_lane_u32,
vreinterpretq_u32_u8, vreinterpretq_u8_u32,
@@ -34,24 +36,28 @@ impl AesBlockCipher for ArmAesBlockCipher {
ArmAesBlockCipher { round_keys }
}
fn generate_batch(&mut self, AesIndex(aes_ctr): AesIndex) -> [u8; BYTES_PER_BATCH] {
fn generate_batch(&mut self, data: [u128; AES_CALLS_PER_BATCH]) -> [u8; BYTES_PER_BATCH] {
#[target_feature(enable = "aes,neon")]
unsafe fn implementation(
this: &ArmAesBlockCipher,
AesIndex(aes_ctr): AesIndex,
data: [u128; AES_CALLS_PER_BATCH],
) -> [u8; BYTES_PER_BATCH] {
let mut output = [0u8; BYTES_PER_BATCH];
// We want 128 bytes of output, the ctr gives 128 bit message (16 bytes)
for (i, out) in output.chunks_exact_mut(16).enumerate() {
for (input, out) in data.iter().copied().zip(output.chunks_exact_mut(16)) {
// Safe because we prevent the user from creating the Generator
// on non-supported hardware
let encrypted = encrypt(aes_ctr + (i as u128), &this.round_keys);
let encrypted = encrypt(input, &this.round_keys);
out.copy_from_slice(&encrypted.to_ne_bytes());
}
output
}
// SAFETY: we checked for aes and neon availability in `Self::new`
unsafe { implementation(self, AesIndex(aes_ctr)) }
unsafe { implementation(self, data) }
}
/// Encrypts a single 128-bit block and returns it as native-endian bytes.
fn generate_next(&mut self, data: u128) -> [u8; BYTES_PER_AES_CALL] {
    // SAFETY: presumably sound because `Self::new` refuses to construct the
    // cipher on hardware without AES support (see the analogous comment in
    // `generate_batch`) — confirm `new` performs that check.
    unsafe { encrypt(data, &self.round_keys) }.to_ne_bytes()
}
}

View File

@@ -1,9 +1,9 @@
use crate::generators::aes_ctr::{AesCtrGenerator, AesKey, ChildrenIterator};
use crate::generators::aes_ctr::{AesCtrGenerator, ChildrenIterator};
use crate::generators::implem::aarch64::block_cipher::ArmAesBlockCipher;
use crate::generators::{ByteCount, BytesPerChild, ChildrenCount, ForkError, RandomGenerator};
use crate::seeders::Seed;
use crate::seeders::SeedKind;
/// A random number generator using the `aesni` instructions.
/// A random number generator using the arm `neon` instructions.
pub struct NeonAesRandomGenerator(pub(super) AesCtrGenerator<ArmAesBlockCipher>);
/// The children iterator used by [`NeonAesRandomGenerator`].
@@ -21,8 +21,8 @@ impl Iterator for ArmAesChildrenIterator {
impl RandomGenerator for NeonAesRandomGenerator {
type ChildrenIter = ArmAesChildrenIterator;
fn new(seed: Seed) -> Self {
NeonAesRandomGenerator(AesCtrGenerator::new(AesKey(seed.0), None, None))
fn new(seed: impl Into<SeedKind>) -> Self {
NeonAesRandomGenerator(AesCtrGenerator::from_seed(seed))
}
fn remaining_bytes(&self) -> ByteCount {
self.0.remaining_bytes()
@@ -107,4 +107,14 @@ mod test {
fn test_bounded_panic() {
generator_generic_test::test_bounded_none_should_panic::<NeonAesRandomGenerator>();
}
#[test]
fn test_vector() {
generator_generic_test::test_vectors::<NeonAesRandomGenerator>();
}
#[test]
fn test_vector_xof_seed() {
generator_generic_test::test_vectors_xof_seed::<NeonAesRandomGenerator>();
}
}

View File

@@ -6,6 +6,7 @@
//! [intel aesni white paper 323641-001 revision 3.0](https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf).
mod block_cipher;
pub use block_cipher::ArmAesBlockCipher;
mod generator;
pub use generator::*;

View File

@@ -1,4 +1,6 @@
use crate::generators::aes_ctr::{AesBlockCipher, AesIndex, AesKey, BYTES_PER_BATCH};
use crate::generators::aes_ctr::{
AesBlockCipher, AesKey, AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_PER_BATCH,
};
use std::arch::x86_64::{
__m128i, _mm_aesenc_si128, _mm_aesenclast_si128, _mm_aeskeygenassist_si128, _mm_shuffle_epi32,
_mm_slli_si128, _mm_store_si128, _mm_xor_si128,
@@ -30,26 +32,30 @@ impl AesBlockCipher for AesniBlockCipher {
AesniBlockCipher { round_keys }
}
fn generate_batch(&mut self, AesIndex(aes_ctr): AesIndex) -> [u8; BYTES_PER_BATCH] {
fn generate_batch(&mut self, data: [u128; AES_CALLS_PER_BATCH]) -> [u8; BYTES_PER_BATCH] {
#[target_feature(enable = "sse2,aes")]
unsafe fn implementation(
this: &AesniBlockCipher,
AesIndex(aes_ctr): AesIndex,
data: [u128; AES_CALLS_PER_BATCH],
) -> [u8; BYTES_PER_BATCH] {
si128arr_to_u8arr(aes_encrypt_many(
u128_to_si128(aes_ctr),
u128_to_si128(aes_ctr + 1),
u128_to_si128(aes_ctr + 2),
u128_to_si128(aes_ctr + 3),
u128_to_si128(aes_ctr + 4),
u128_to_si128(aes_ctr + 5),
u128_to_si128(aes_ctr + 6),
u128_to_si128(aes_ctr + 7),
u128_to_si128(data[0]),
u128_to_si128(data[1]),
u128_to_si128(data[2]),
u128_to_si128(data[3]),
u128_to_si128(data[4]),
u128_to_si128(data[5]),
u128_to_si128(data[6]),
u128_to_si128(data[7]),
&this.round_keys,
))
}
// SAFETY: we checked for aes and sse2 availability in `Self::new`
unsafe { implementation(self, AesIndex(aes_ctr)) }
unsafe { implementation(self, data) }
}
fn generate_next(&mut self, data: u128) -> [u8; BYTES_PER_AES_CALL] {
unsafe { transmute(aes_encrypt_one(u128_to_si128(data), &self.round_keys)) }
}
}
@@ -61,6 +67,19 @@ unsafe fn generate_round_keys(key: AesKey) -> [__m128i; 11] {
keys
}
#[inline(always)]
fn aes_encrypt_one(message: __m128i, keys: &[__m128i; 11]) -> __m128i {
unsafe {
let mut tmp_1 = _mm_xor_si128(message, keys[0]);
for key in keys.iter().take(10).skip(1) {
tmp_1 = _mm_aesenc_si128(tmp_1, *key);
}
_mm_aesenclast_si128(tmp_1, keys[10])
}
}
// Uses aes to encrypt many values at once. This allows a substantial speedup (around 30%)
// compared to the naive approach.
#[allow(clippy::too_many_arguments)]
@@ -228,4 +247,14 @@ mod test {
assert_eq!(CIPHERTEXT, si128_to_u128(*ct));
}
}
#[test]
fn test_encrypt_one_message() {
let message = u128_to_si128(PLAINTEXT);
let key = u128_to_si128(CIPHER_KEY);
let mut keys: [__m128i; 11] = [u128_to_si128(0); 11];
aes_128_key_expansion(key, &mut keys);
let ciphertext = aes_encrypt_one(message, &keys);
assert_eq!(CIPHERTEXT, si128_to_u128(ciphertext));
}
}

View File

@@ -1,7 +1,7 @@
use crate::generators::aes_ctr::{AesCtrGenerator, AesKey, ChildrenIterator};
use crate::generators::aes_ctr::{AesCtrGenerator, ChildrenIterator};
use crate::generators::implem::aesni::block_cipher::AesniBlockCipher;
use crate::generators::{ByteCount, BytesPerChild, ChildrenCount, ForkError, RandomGenerator};
use crate::seeders::Seed;
use crate::seeders::SeedKind;
/// A random number generator using the `aesni` instructions.
pub struct AesniRandomGenerator(pub(super) AesCtrGenerator<AesniBlockCipher>);
@@ -21,8 +21,8 @@ impl Iterator for AesniChildrenIterator {
impl RandomGenerator for AesniRandomGenerator {
type ChildrenIter = AesniChildrenIterator;
fn new(seed: Seed) -> Self {
AesniRandomGenerator(AesCtrGenerator::new(AesKey(seed.0), None, None))
fn new(seed: impl Into<SeedKind>) -> Self {
AesniRandomGenerator(AesCtrGenerator::from_seed(seed))
}
fn remaining_bytes(&self) -> ByteCount {
self.0.remaining_bytes()
@@ -107,4 +107,14 @@ mod test {
fn test_bounded_panic() {
generator_generic_test::test_bounded_none_should_panic::<AesniRandomGenerator>();
}
#[test]
fn test_vector() {
generator_generic_test::test_vectors::<AesniRandomGenerator>();
}
#[test]
fn test_vector_xof_seed() {
generator_generic_test::test_vectors_xof_seed::<AesniRandomGenerator>();
}
}

View File

@@ -5,6 +5,7 @@
//! [intel aesni white paper 323641-001 revision 3.0](https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf).
mod block_cipher;
pub use block_cipher::AesniBlockCipher;
mod generator;
pub use generator::*;

View File

@@ -1,5 +1,5 @@
use crate::generators::aes_ctr::{
AesBlockCipher, AesIndex, AesKey, AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_PER_BATCH,
AesBlockCipher, AesKey, AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_PER_BATCH,
};
use aes::cipher::generic_array::GenericArray;
use aes::cipher::{BlockEncrypt, KeyInit};
@@ -19,19 +19,23 @@ impl AesBlockCipher for SoftwareBlockCipher {
SoftwareBlockCipher { aes }
}
fn generate_batch(&mut self, AesIndex(aes_ctr): AesIndex) -> [u8; BYTES_PER_BATCH] {
fn generate_batch(&mut self, data: [u128; AES_CALLS_PER_BATCH]) -> [u8; BYTES_PER_BATCH] {
aes_encrypt_many(
aes_ctr,
aes_ctr + 1,
aes_ctr + 2,
aes_ctr + 3,
aes_ctr + 4,
aes_ctr + 5,
aes_ctr + 6,
aes_ctr + 7,
&self.aes,
data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], &self.aes,
)
}
fn generate_next(&mut self, data: u128) -> [u8; BYTES_PER_AES_CALL] {
aes_encrypt_one(data, &self.aes)
}
}
fn aes_encrypt_one(message: u128, cipher: &Aes128) -> [u8; BYTES_PER_AES_CALL] {
let mut b1 = GenericArray::clone_from_slice(&message.to_ne_bytes()[..]);
cipher.encrypt_block(&mut b1);
b1.into()
}
// Uses aes to encrypt many values at once. This allows a substantial speedup (around 30%)
@@ -110,4 +114,12 @@ mod test {
);
}
}
#[test]
fn test_encrypt_one_message() {
let key: [u8; BYTES_PER_AES_CALL] = CIPHER_KEY.to_ne_bytes();
let aes = Aes128::new(&GenericArray::from(key));
let ciphertext = aes_encrypt_one(PLAINTEXT, &aes);
assert_eq!(u128::from_ne_bytes(ciphertext), CIPHERTEXT);
}
}

View File

@@ -1,7 +1,7 @@
use crate::generators::aes_ctr::{AesCtrGenerator, AesKey, ChildrenIterator};
use crate::generators::aes_ctr::{AesCtrGenerator, ChildrenIterator};
use crate::generators::implem::soft::block_cipher::SoftwareBlockCipher;
use crate::generators::{ByteCount, BytesPerChild, ChildrenCount, ForkError, RandomGenerator};
use crate::seeders::Seed;
use crate::seeders::SeedKind;
/// A random number generator using a software implementation.
pub struct SoftwareRandomGenerator(pub(super) AesCtrGenerator<SoftwareBlockCipher>);
@@ -21,8 +21,8 @@ impl Iterator for SoftwareChildrenIterator {
impl RandomGenerator for SoftwareRandomGenerator {
type ChildrenIter = SoftwareChildrenIterator;
fn new(seed: Seed) -> Self {
SoftwareRandomGenerator(AesCtrGenerator::new(AesKey(seed.0), None, None))
fn new(seed: impl Into<SeedKind>) -> Self {
SoftwareRandomGenerator(AesCtrGenerator::from_seed(seed))
}
fn remaining_bytes(&self) -> ByteCount {
self.0.remaining_bytes()
@@ -52,49 +52,57 @@ mod test {
use crate::generators::aes_ctr::aes_ctr_generic_test;
use crate::generators::generator_generic_test;
#[test]
fn prop_fork_first_state_table_index() {
aes_ctr_generic_test::prop_fork_first_state_table_index::<SoftwareBlockCipher>();
}
// We use powerpc64 as the target to test behavior on big-endian
// However, we run these tests using an emulator. Thus, these get really slow
// so we skip them
#[cfg(not(target_arch = "powerpc64"))]
mod fork_tests {
use super::*;
#[test]
fn prop_fork_last_bound_table_index() {
aes_ctr_generic_test::prop_fork_last_bound_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_first_state_table_index() {
aes_ctr_generic_test::prop_fork_first_state_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_parent_bound_table_index() {
aes_ctr_generic_test::prop_fork_parent_bound_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_last_bound_table_index() {
aes_ctr_generic_test::prop_fork_last_bound_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_parent_state_table_index() {
aes_ctr_generic_test::prop_fork_parent_state_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_parent_bound_table_index() {
aes_ctr_generic_test::prop_fork_parent_bound_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork() {
aes_ctr_generic_test::prop_fork::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_parent_state_table_index() {
aes_ctr_generic_test::prop_fork_parent_state_table_index::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_children_remaining_bytes() {
aes_ctr_generic_test::prop_fork_children_remaining_bytes::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork() {
aes_ctr_generic_test::prop_fork::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_parent_remaining_bytes() {
aes_ctr_generic_test::prop_fork_parent_remaining_bytes::<SoftwareBlockCipher>();
}
#[test]
fn prop_fork_children_remaining_bytes() {
aes_ctr_generic_test::prop_fork_children_remaining_bytes::<SoftwareBlockCipher>();
}
#[test]
fn test_roughly_uniform() {
generator_generic_test::test_roughly_uniform::<SoftwareRandomGenerator>();
}
#[test]
fn prop_fork_parent_remaining_bytes() {
aes_ctr_generic_test::prop_fork_parent_remaining_bytes::<SoftwareBlockCipher>();
}
#[test]
fn test_fork() {
generator_generic_test::test_fork_children::<SoftwareRandomGenerator>();
#[test]
fn test_fork() {
generator_generic_test::test_fork_children::<SoftwareRandomGenerator>();
}
#[test]
fn test_roughly_uniform() {
generator_generic_test::test_roughly_uniform::<SoftwareRandomGenerator>();
}
}
#[test]
@@ -107,4 +115,14 @@ mod test {
fn test_bounded_panic() {
generator_generic_test::test_bounded_none_should_panic::<SoftwareRandomGenerator>();
}
#[test]
fn test_vector() {
generator_generic_test::test_vectors::<SoftwareRandomGenerator>();
}
#[test]
fn test_vector_xof_seed() {
generator_generic_test::test_vectors_xof_seed::<SoftwareRandomGenerator>();
}
}

View File

@@ -1,6 +1,7 @@
//! A module using a software fallback implementation of random number generator.
mod block_cipher;
pub use block_cipher::SoftwareBlockCipher;
mod generator;
pub use generator::*;

View File

@@ -1,7 +1,7 @@
//! A module containing random generators objects.
//!
//! See [crate-level](`crate`) explanations.
use crate::seeders::Seed;
use crate::seeders::SeedKind;
use std::error::Error;
use std::fmt::{Display, Formatter};
@@ -62,7 +62,7 @@ pub trait RandomGenerator: Iterator<Item = u8> {
///
/// This operation is usually costly to perform, as the aes round keys need to be generated from
/// the seed.
fn new(seed: Seed) -> Self;
fn new(seed: impl Into<SeedKind>) -> Self;
/// Returns the number of bytes that can still be outputted by the generator before reaching its
/// bound.
@@ -131,6 +131,7 @@ pub use default::DefaultRandomGenerator;
#[allow(unused)] // to please clippy when tests are not activated
pub mod generator_generic_test {
use super::*;
use crate::seeders::{Seed, XofSeed};
use rand::Rng;
const REPEATS: usize = 1_000;
@@ -236,4 +237,65 @@ pub mod generator_generic_test {
// One call too many, should panic
bounded.next().ok_or("expected test panic").unwrap();
}
pub fn test_vectors<G: RandomGenerator>() {
// Number of random bytes to generate,
// this should be 2 batch worth of aes calls (where a batch is 8 aes)
const N_BYTES: usize = 16 * 2 * 8;
const EXPECTED_BYTE: [u8; N_BYTES] = [
14, 216, 93, 249, 97, 26, 187, 114, 73, 205, 209, 104, 197, 70, 126, 250, 235, 1, 136,
141, 46, 146, 174, 231, 14, 204, 28, 99, 139, 246, 214, 112, 253, 151, 34, 114, 235, 7,
76, 37, 36, 154, 226, 148, 68, 238, 117, 87, 212, 183, 174, 200, 222, 153, 62, 48, 166,
134, 27, 97, 230, 206, 78, 128, 151, 166, 15, 156, 120, 158, 35, 41, 121, 55, 180, 184,
108, 160, 33, 208, 255, 147, 246, 159, 10, 239, 6, 103, 124, 123, 83, 72, 189, 237,
225, 36, 30, 151, 134, 94, 211, 181, 108, 239, 137, 18, 246, 237, 233, 59, 61, 24, 111,
198, 76, 92, 86, 129, 171, 50, 124, 2, 72, 143, 160, 223, 32, 187, 175, 239, 111, 51,
85, 110, 134, 45, 193, 113, 247, 249, 78, 230, 103, 123, 66, 48, 31, 169, 228, 140,
202, 168, 202, 199, 147, 89, 135, 104, 254, 198, 72, 31, 103, 236, 207, 138, 24, 100,
230, 168, 233, 214, 130, 195, 0, 25, 220, 136, 128, 173, 40, 154, 116, 87, 114, 187,
170, 150, 131, 163, 155, 98, 217, 198, 238, 178, 165, 214, 168, 252, 107, 123, 214, 33,
17, 114, 35, 23, 172, 145, 5, 39, 16, 33, 92, 163, 132, 240, 167, 128, 226, 165, 80, 9,
153, 252, 139, 0, 139, 0, 54, 188, 253, 141, 2, 78, 97, 53, 214, 173, 155, 84, 98, 51,
70, 110, 91, 181, 229, 231, 27, 225, 185, 143, 63, 238,
];
let seed_bytes: [u8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
let seed = Seed(u128::from_ne_bytes(seed_bytes));
let mut rng = G::new(seed);
let bytes = rng.take(N_BYTES).collect::<Vec<_>>();
assert_eq!(bytes, EXPECTED_BYTE);
}
pub fn test_vectors_xof_seed<G: RandomGenerator>() {
// Number of random bytes to generate,
// this should be 2 batch worth of aes calls (where a batch is 8 aes)
const N_BYTES: usize = 16 * 2 * 8;
const EXPECTED_BYTE: [u8; N_BYTES] = [
134, 231, 117, 200, 60, 174, 158, 95, 80, 64, 236, 147, 204, 196, 251, 198, 110, 155,
74, 69, 162, 251, 224, 46, 46, 83, 209, 224, 89, 108, 68, 240, 37, 16, 109, 194, 92, 3,
164, 21, 167, 224, 205, 31, 90, 178, 59, 150, 142, 238, 113, 144, 181, 118, 160, 72,
187, 38, 29, 61, 189, 229, 66, 22, 4, 38, 210, 63, 232, 182, 115, 49, 96, 6, 120, 226,
40, 51, 144, 59, 136, 224, 252, 195, 50, 250, 134, 45, 149, 220, 32, 27, 35, 225, 190,
73, 161, 182, 250, 149, 153, 131, 220, 143, 181, 152, 187, 25, 62, 197, 24, 10, 142,
57, 172, 15, 17, 244, 242, 232, 51, 50, 244, 85, 58, 69, 28, 113, 151, 143, 138, 166,
198, 16, 210, 46, 234, 138, 32, 124, 98, 167, 141, 251, 60, 13, 158, 106, 29, 86, 63,
73, 42, 138, 174, 195, 192, 72, 122, 74, 54, 134, 107, 144, 241, 12, 33, 70, 27, 116,
154, 123, 1, 252, 141, 73, 79, 30, 162, 43, 57, 8, 99, 62, 222, 117, 232, 147, 81, 189,
54, 17, 233, 33, 41, 132, 155, 246, 185, 189, 17, 77, 32, 107, 134, 61, 174, 64, 174,
80, 229, 239, 243, 143, 152, 249, 254, 125, 42, 0, 170, 253, 34, 57, 100, 82, 244, 9,
101, 126, 138, 218, 215, 55, 58, 177, 154, 5, 28, 113, 89, 123, 129, 254, 212, 191,
162, 44, 120, 67, 241, 157, 31, 162, 113, 91,
];
let seed_bytes: [u8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
let seed = u128::from_ne_bytes(seed_bytes);
let xof_seed = XofSeed::new_u128(seed, [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']);
let mut rng = G::new(xof_seed);
let bytes = rng.take(N_BYTES).collect::<Vec<_>>();
assert_eq!(bytes, EXPECTED_BYTE);
}
}

View File

@@ -1,7 +1,7 @@
use crate::seeders::{Seed, Seeder};
/// There is no `rseed` equivalent in the ARM specification until `ARMv8.5-A`.
/// However it seems that these instructions are not exposed in `core::arch::aarch64`.
/// However, it seems that these instructions are not exposed in `core::arch::aarch64`.
///
/// Our primary interest for supporting aarch64 targets is AppleSilicon support
/// which for the M1 macs available, they are based on the `ARMv8.4-A` set.

View File

@@ -9,6 +9,90 @@
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Seed(pub u128);
/// A Seed as described in the [NIST document]
///
/// This seed contains 2 information:
/// * The domain separator bytes (ASCII string)
/// * The seed bytes
///
/// [NIST document]: https://eprint.iacr.org/2025/699
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XofSeed {
// We store the domain separator concatenated with the seed bytes (str||seed)
// as it makes it easier to create the iterator of u128 blocks
data: Vec<u8>,
}
impl XofSeed {
const DOMAIN_SEP_LEN: usize = 8;
// Creates a new seed of 128 bits
pub fn new_u128(seed: u128, domain_separator: [u8; Self::DOMAIN_SEP_LEN]) -> Self {
let mut data = vec![0u8; size_of::<u128>() + domain_separator.len()];
data[..Self::DOMAIN_SEP_LEN].copy_from_slice(domain_separator.as_slice());
data[Self::DOMAIN_SEP_LEN..].copy_from_slice(seed.to_ne_bytes().as_slice());
Self { data }
}
pub fn new(mut seed: Vec<u8>, domain_separator: [u8; Self::DOMAIN_SEP_LEN]) -> Self {
seed.resize(domain_separator.len() + seed.len(), 0);
seed.rotate_right(domain_separator.len());
seed[..Self::DOMAIN_SEP_LEN].copy_from_slice(domain_separator.as_slice());
Self { data: seed }
}
/// Returns the seed part
pub fn seed(&self) -> &[u8] {
&self.data[Self::DOMAIN_SEP_LEN..]
}
/// Returns the domain separator
pub fn domain_separator(&self) -> [u8; Self::DOMAIN_SEP_LEN] {
let mut sep = [0u8; Self::DOMAIN_SEP_LEN];
sep.copy_from_slice(&self.data[..Self::DOMAIN_SEP_LEN]);
sep
}
/// Total len (seed bytes + domain separator) in bits
pub fn bit_len(&self) -> u128 {
(self.data.len()) as u128 * 8
}
/// Returns an iterator that iterates over the concatenated seed||domain_separator
/// as blocks of u128 bits
pub(crate) fn iter_u128_blocks(&self) -> impl Iterator<Item = u128> + '_ {
self.data.chunks(size_of::<u128>()).map(move |chunk| {
let mut buf = [0u8; size_of::<u128>()];
buf[..chunk.len()].copy_from_slice(chunk);
u128::from_ne_bytes(buf)
})
}
}
pub enum SeedKind {
/// Initializes the Aes-Ctr with a counter starting at 0
/// and uses the seed as the Aes key.
Ctr(Seed),
/// Seed that initialized the Aes-Ctr following the NIST document (see [XofSeed]).
///
/// An Aes-Key and starting counter will be derived from the XofSeed, to
/// then initialize the Aes-Ctr random generator
Xof(XofSeed),
}
impl From<Seed> for SeedKind {
fn from(value: Seed) -> Self {
Self::Ctr(value)
}
}
impl From<XofSeed> for SeedKind {
fn from(value: XofSeed) -> Self {
Self::Xof(value)
}
}
/// A trait representing a seeding strategy.
pub trait Seeder {
/// Generates a new seed.
@@ -29,7 +113,7 @@ pub use implem::*;
#[cfg(test)]
mod generic_tests {
use crate::seeders::Seeder;
use crate::seeders::{Seeder, XofSeed};
/// Naively verifies that two fixed-size sequences generated by repeatedly calling the seeder
/// are different.
@@ -47,4 +131,31 @@ mod generic_tests {
}
}
}
#[test]
fn test_xof_seed_getters() {
let bits = u128::from_ne_bytes([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
let dsep = [b't', b'f', b'h', b'e', b'k', b's', b'p', b's'];
let seed = XofSeed::new_u128(bits, dsep);
let s = u128::from_ne_bytes(seed.seed().try_into().unwrap());
assert_eq!(s, bits);
assert_eq!(seed.domain_separator(), dsep);
assert_eq!(seed.bit_len(), 192);
let collected_u128s = seed.iter_u128_blocks().collect::<Vec<_>>();
assert_eq!(
collected_u128s,
vec![
u128::from_ne_bytes([
b't', b'f', b'h', b'e', b'k', b's', b'p', b's', 1, 2, 3, 4, 5, 6, 7, 8
]),
u128::from_ne_bytes([9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]),
]
);
// To make sure both constructors yield the same results
let seed2 = XofSeed::new(bits.to_ne_bytes().to_vec(), dsep);
assert_eq!(seed.data, seed2.data);
}
}

View File

@@ -43,9 +43,9 @@ Comparing to the [CPU example](../../getting_started/quick_start.md), HPU set up
Here is a full example (combining the client and server parts):
```rust
use tfhe::{ConfigBuilder, set_server_key, FheUint8, ClientKey, CompressedServerKey};
use tfhe::{Config, set_server_key, FheUint8, ClientKey, CompressedServerKey};
use tfhe::prelude::*;
use tfhe_hpu_backend::prelude::*;
use tfhe::tfhe_hpu_backend::prelude::*;
fn main() {
@@ -53,7 +53,7 @@ fn main() {
// HPU configuration knobs are retrieved from a TOML configuration file. Prebuilt configurations could be find in `backends/tfhe-hpu-backend/config_store`
// For ease of use a setup_hpu.sh script is available in repository root folder and it handle the required environment variables setup and driver initialisation
// More details are available in `backends/tfhe-hpu-backend/README.md`
let hpu_device = HpuDevice::from_config(ShellString::new("${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string()));
let hpu_device = HpuDevice::from_config(ShellString::new("${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string()).expand().as_str());
// Generate keys ----------------------------------------------------------
let config = Config::from_hpu_device(&hpu_device);
@@ -106,7 +106,7 @@ The server first needs to set up its keys with `set_server_key((hpu_device, comp
Then, homomorphic computations are performed using the same approach as the [CPU operations](../../fhe-computation/operations/README.md).
``` rust
``` Rust
// Server-side
let result = a + b;

View File

@@ -260,16 +260,19 @@ pub fn main() {
let roi_start = Instant::now();
let res_hpu = (0..args.iter)
.map(|_i| {
.filter_map(|i| {
let res = HpuRadixCiphertext::exec(&proto, iop.opcode(), &srcs_enc, &imms);
std::hint::black_box(&res);
res
if i == (args.iter - 1) {
Some(res)
} else {
None
}
})
.next_back()
.expect("Iteration must be greater than 0");
.collect::<Vec<_>>();
// let res_fhe = $fhe_type::from(res_hpu);
let res_fhe = res_hpu
.last()
.expect("Iteration must be greater than 0")
.iter()
.map(|x| x.to_radix_ciphertext())
.collect::<Vec<_>>();

View File

@@ -25,6 +25,8 @@ use tfhe_versionable::Versionize;
#[cfg(feature = "hpu")]
use crate::integer::hpu::ciphertext::HpuRadixCiphertext;
#[cfg(feature = "gpu")]
use crate::prelude::IfThenElseSizeOnGpu;
#[cfg(feature = "hpu")]
use tfhe_hpu_backend::prelude::*;
@@ -1769,3 +1771,47 @@ impl std::ops::Not for &FheBool {
FheBool::new(ciphertext, tag)
}
}
#[cfg(feature = "gpu")]
impl<Id> IfThenElseSizeOnGpu<FheUint<Id>> for FheBool
where
Id: FheUintId,
{
fn get_if_then_else_size_on_gpu(&self, ct_then: &FheUint<Id>, ct_else: &FheUint<Id>) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
let streams = &cuda_key.streams;
cuda_key.key.key.get_if_then_else_size_on_gpu(
&CudaBooleanBlock(self.ciphertext.on_gpu(streams).duplicate(streams)),
&*ct_then.ciphertext.on_gpu(streams),
&*ct_else.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
})
}
}
#[cfg(feature = "gpu")]
impl<Id> IfThenElseSizeOnGpu<FheInt<Id>> for FheBool
where
Id: FheIntId,
{
fn get_if_then_else_size_on_gpu(&self, ct_then: &FheInt<Id>, ct_else: &FheInt<Id>) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
let streams = &cuda_key.streams;
cuda_key.key.key.get_if_then_else_size_on_gpu(
&CudaBooleanBlock(self.ciphertext.on_gpu(streams).duplicate(streams)),
&*ct_then.ciphertext.on_gpu(streams),
&*ct_else.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
})
}
}

View File

@@ -1,8 +1,6 @@
#[cfg(feature = "gpu")]
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_cuda_internal_keys;
use crate::high_level_api::integers::{FheIntId, FheUintId};
use crate::high_level_api::keys::InternalServerKey;
#[cfg(feature = "gpu")]
@@ -2256,14 +2254,12 @@ where
fn get_gt_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_gt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_gt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2272,14 +2268,12 @@ where
fn get_ge_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_ge_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_ge_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2288,14 +2282,12 @@ where
fn get_lt_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_lt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_lt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2304,14 +2296,12 @@ where
fn get_le_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_le_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_le_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2326,14 +2316,12 @@ where
fn get_min_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_min_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_min_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2349,14 +2337,12 @@ where
fn get_max_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_max_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_max_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2373,14 +2359,12 @@ where
fn get_left_shift_size_on_gpu(&self, rhs: &FheUint<Id2>) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_left_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_left_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2396,14 +2380,12 @@ where
fn get_right_shift_size_on_gpu(&self, rhs: &FheUint<Id2>) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_right_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_right_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2419,14 +2401,12 @@ where
fn get_rotate_left_size_on_gpu(&self, rhs: &FheUint<Id2>) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_rotate_left_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_rotate_left_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2442,14 +2422,12 @@ where
fn get_rotate_right_size_on_gpu(&self, rhs: &FheUint<Id2>) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_rotate_right_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_rotate_right_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}

View File

@@ -3,7 +3,7 @@ use crate::high_level_api::global_state;
use crate::high_level_api::integers::FheIntId;
use crate::high_level_api::keys::InternalServerKey;
use crate::integer::block_decomposition::DecomposableInto;
use crate::prelude::{OverflowingAdd, OverflowingMul, OverflowingSub};
use crate::prelude::{OverflowingAdd, OverflowingMul, OverflowingNeg, OverflowingSub};
use crate::{FheBool, FheInt};
impl<Id> OverflowingAdd<Self> for &FheInt<Id>
@@ -537,3 +537,73 @@ where
<&Self as OverflowingMul<&Self>>::overflowing_mul(&self, other)
}
}
impl<Id> OverflowingNeg for &FheInt<Id>
where
Id: FheIntId,
{
type Output = FheInt<Id>;
/// Negates self, overflowing if this is equal to the minimum value.
///
/// * The operation is modular, i.e. on overflow the result wraps around.
/// * On overflow the [FheBool] is true (if self encrypts the minimum value), otherwise false
///
/// # Example
///
/// ```rust
/// use tfhe::prelude::*;
/// use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheInt16};
///
/// let (client_key, server_key) = generate_keys(ConfigBuilder::default());
/// set_server_key(server_key);
///
/// let a = FheInt16::encrypt(i16::MIN, &client_key);
///
/// let (result, overflowed) = a.overflowing_neg();
/// let (expected_result, expected_overflowed) = i16::MIN.overflowing_neg();
/// let result: i16 = result.decrypt(&client_key);
/// assert_eq!(result, expected_result);
/// assert_eq!(overflowed.decrypt(&client_key), expected_overflowed);
/// assert!(overflowed.decrypt(&client_key));
/// ```
fn overflowing_neg(self) -> (Self::Output, FheBool) {
global_state::with_internal_keys(|key| match key {
InternalServerKey::Cpu(cpu_key) => {
let (result, overflow) = cpu_key
.pbs_key()
.overflowing_neg_parallelized(&*self.ciphertext.on_cpu());
(
FheInt::new(result, cpu_key.tag.clone()),
FheBool::new(overflow, cpu_key.tag.clone()),
)
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(cuda_key) => {
let (result, overflow) = cuda_key.pbs_key().overflowing_neg(
&*self.ciphertext.on_gpu(&cuda_key.streams),
&cuda_key.streams,
);
(
FheInt::new(result, cuda_key.tag.clone()),
FheBool::new(overflow, cuda_key.tag.clone()),
)
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
panic!("Hpu does not support this overflowing_neg yet.")
}
})
}
}
impl<Id> OverflowingNeg for FheInt<Id>
where
Id: FheIntId,
{
type Output = Self;
fn overflowing_neg(self) -> (Self::Output, FheBool) {
<&Self as OverflowingNeg>::overflowing_neg(&self)
}
}

View File

@@ -2,8 +2,6 @@
use crate::core_crypto::commons::numeric::CastFrom;
use crate::high_level_api::errors::UnwrapResultExt;
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_cuda_internal_keys;
use crate::high_level_api::integers::signed::inner::SignedRadixCiphertext;
use crate::high_level_api::integers::FheIntId;
use crate::high_level_api::keys::InternalServerKey;
@@ -409,13 +407,11 @@ where
fn get_gt_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -424,13 +420,11 @@ where
fn get_ge_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -439,13 +433,11 @@ where
fn get_lt_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -454,13 +446,11 @@ where
fn get_le_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -477,13 +467,11 @@ where
fn get_min_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_min_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_min_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -499,13 +487,11 @@ where
fn get_max_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_max_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_max_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -634,13 +620,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheInt<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_left_shift_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})
@@ -692,13 +676,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheInt<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_right_shift_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})
@@ -750,13 +732,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheInt<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_rotate_left_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})
@@ -808,13 +788,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheInt<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_rotate_right_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})

View File

@@ -4,11 +4,12 @@ use crate::high_level_api::integers::signed::tests::{
use crate::high_level_api::integers::unsigned::tests::gpu::setup_gpu;
use crate::prelude::{
check_valid_cuda_malloc, AddSizeOnGpu, BitAndSizeOnGpu, BitNotSizeOnGpu, BitOrSizeOnGpu,
BitXorSizeOnGpu, FheMaxSizeOnGpu, FheMinSizeOnGpu, FheOrdSizeOnGpu, FheTryEncrypt,
RotateLeftSizeOnGpu, RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SubSizeOnGpu,
BitXorSizeOnGpu, FheEncrypt, FheMaxSizeOnGpu, FheMinSizeOnGpu, FheOrdSizeOnGpu, FheTryEncrypt,
IfThenElseSizeOnGpu, RotateLeftSizeOnGpu, RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu,
SubSizeOnGpu,
};
use crate::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS;
use crate::{FheInt32, FheUint32, GpuIndex};
use crate::{FheBool, FheInt32, FheUint32, GpuIndex};
use rand::Rng;
#[test]
@@ -291,3 +292,35 @@ fn test_gpu_get_shift_rotate_size_on_gpu() {
GpuIndex::new(0)
));
}
#[test]
fn test_gpu_get_if_then_else_size_on_gpu() {
    // The temporary-buffer sizes reported for if_then_else / select / cmux on
    // signed operands must all be allocatable on GPU 0.
    let client_key = setup_gpu(Some(PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS));
    let mut rng = rand::thread_rng();
    let lhs_clear = rng.gen_range(1..=i32::MAX);
    let rhs_clear = rng.gen_range(1..=i32::MAX);
    let cond_clear = rng.gen_range(0..=1);
    let mut lhs = FheInt32::try_encrypt(lhs_clear, &client_key).unwrap();
    let mut rhs = FheInt32::try_encrypt(rhs_clear, &client_key).unwrap();
    let cond = FheBool::encrypt(cond_clear != 0, &client_key);
    lhs.move_to_current_device();
    rhs.move_to_current_device();
    // All three entry points share the same size query contract.
    for tmp_buffer_size in [
        cond.get_if_then_else_size_on_gpu(&lhs, &rhs),
        cond.get_select_size_on_gpu(&lhs, &rhs),
        cond.get_cmux_size_on_gpu(&lhs, &rhs),
    ] {
        assert!(check_valid_cuda_malloc(tmp_buffer_size, GpuIndex::new(0)));
    }
}

View File

@@ -6,8 +6,6 @@ use super::inner::RadixCiphertext;
#[cfg(feature = "gpu")]
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_cuda_internal_keys;
use crate::high_level_api::integers::FheUintId;
use crate::high_level_api::keys::InternalServerKey;
#[cfg(feature = "gpu")]
@@ -2496,14 +2494,12 @@ where
fn get_gt_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_gt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_gt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2512,14 +2508,12 @@ where
fn get_ge_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_ge_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_ge_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2528,14 +2522,12 @@ where
fn get_lt_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_lt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_lt_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2544,14 +2536,12 @@ where
fn get_le_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_le_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_le_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2566,14 +2556,12 @@ where
fn get_min_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_min_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_min_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2589,14 +2577,12 @@ where
fn get_max_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_max_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_max_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2611,14 +2597,12 @@ where
fn get_left_shift_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_left_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_left_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2633,14 +2617,12 @@ where
fn get_right_shift_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_right_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_right_shift_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2655,14 +2637,12 @@ where
fn get_rotate_left_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_rotate_left_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_rotate_left_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}
@@ -2677,14 +2657,12 @@ where
fn get_rotate_right_size_on_gpu(&self, rhs: &Self) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key.key.key.get_rotate_right_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
})
let streams = &cuda_key.streams;
cuda_key.key.key.get_rotate_right_size_on_gpu(
&*self.ciphertext.on_gpu(streams),
&rhs.ciphertext.on_gpu(streams),
streams,
)
} else {
0
}

View File

@@ -3,7 +3,7 @@ use crate::high_level_api::global_state;
use crate::high_level_api::integers::FheUintId;
use crate::high_level_api::keys::InternalServerKey;
use crate::integer::block_decomposition::DecomposableInto;
use crate::prelude::{CastInto, OverflowingAdd, OverflowingMul, OverflowingSub};
use crate::prelude::{CastInto, OverflowingAdd, OverflowingMul, OverflowingNeg, OverflowingSub};
use crate::{FheBool, FheUint};
impl<Id> OverflowingAdd<Self> for &FheUint<Id>
@@ -530,3 +530,50 @@ where
<&Self as OverflowingMul<&Self>>::overflowing_mul(&self, other)
}
}
impl<Id> OverflowingNeg for &FheUint<Id>
where
    Id: FheUintId,
{
    type Output = FheUint<Id>;

    /// Negates the ciphertext and also returns an encrypted flag reporting
    /// whether the negation overflowed, dispatching to the active device key.
    fn overflowing_neg(self) -> (Self::Output, FheBool) {
        global_state::with_internal_keys(|key| match key {
            InternalServerKey::Cpu(cpu_key) => {
                let ct = self.ciphertext.on_cpu();
                let (value, did_overflow) = cpu_key.pbs_key().overflowing_neg_parallelized(&*ct);
                let tag = cpu_key.tag.clone();
                (
                    FheUint::new(value, tag.clone()),
                    FheBool::new(did_overflow, tag),
                )
            }
            #[cfg(feature = "gpu")]
            InternalServerKey::Cuda(cuda_key) => {
                let streams = &cuda_key.streams;
                let (value, did_overflow) = cuda_key
                    .pbs_key()
                    .overflowing_neg(&*self.ciphertext.on_gpu(streams), streams);
                let tag = cuda_key.tag.clone();
                (
                    FheUint::new(value, tag.clone()),
                    FheBool::new(did_overflow, tag),
                )
            }
            #[cfg(feature = "hpu")]
            InternalServerKey::Hpu(_device) => {
                panic!("Hpu does not support this overflowing_neg yet.")
            }
        })
    }
}
impl<Id> OverflowingNeg for FheUint<Id>
where
    Id: FheUintId,
{
    type Output = Self;
    /// Consuming form: delegates to the by-reference [`OverflowingNeg`]
    /// implementation for `&FheUint`.
    fn overflowing_neg(self) -> (Self::Output, FheBool) {
        <&Self as OverflowingNeg>::overflowing_neg(&self)
    }
}

View File

@@ -7,8 +7,6 @@ use super::inner::RadixCiphertext;
use crate::error::InvalidRangeError;
use crate::high_level_api::errors::UnwrapResultExt;
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_cuda_internal_keys;
use crate::high_level_api::integers::FheUintId;
use crate::high_level_api::keys::InternalServerKey;
#[cfg(feature = "gpu")]
@@ -318,13 +316,11 @@ where
fn get_gt_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_gt_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_gt_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -333,13 +329,11 @@ where
fn get_ge_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_ge_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_ge_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -348,13 +342,11 @@ where
fn get_lt_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_lt_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_lt_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -363,13 +355,11 @@ where
fn get_le_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_le_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -386,13 +376,11 @@ where
fn get_min_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_min_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_min_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -408,13 +396,11 @@ where
fn get_max_size_on_gpu(&self, _rhs: Clear) -> u64 {
global_state::with_internal_keys(|key| {
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
cuda_key
.key
.key
.get_scalar_max_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
})
let streams = &cuda_key.streams;
cuda_key
.key
.key
.get_scalar_max_size_on_gpu(&*self.ciphertext.on_gpu(streams), streams)
} else {
0
}
@@ -937,13 +923,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheUint<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_left_shift_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})
@@ -996,13 +980,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheUint<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_right_shift_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})
@@ -1055,13 +1037,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheUint<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_rotate_left_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})
@@ -1114,13 +1094,11 @@ macro_rules! define_scalar_rotate_shifts {
|lhs: &FheUint<_>, _rhs| {
global_state::with_internal_keys(|key|
if let InternalServerKey::Cuda(cuda_key) = key {
with_cuda_internal_keys(|keys| {
let streams = &keys.streams;
let streams = &cuda_key.streams;
cuda_key.key.key.get_scalar_rotate_right_size_on_gpu(
&*lhs.ciphertext.on_gpu(streams),
streams,
)
})
} else {
0
})

View File

@@ -1,14 +1,15 @@
use crate::high_level_api::traits::AddSizeOnGpu;
use crate::prelude::{
check_valid_cuda_malloc, BitAndSizeOnGpu, BitNotSizeOnGpu, BitOrSizeOnGpu, BitXorSizeOnGpu,
FheMaxSizeOnGpu, FheMinSizeOnGpu, FheOrdSizeOnGpu, FheTryEncrypt, RotateLeftSizeOnGpu,
RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SubSizeOnGpu,
FheEncrypt, FheMaxSizeOnGpu, FheMinSizeOnGpu, FheOrdSizeOnGpu, FheTryEncrypt,
IfThenElseSizeOnGpu, RotateLeftSizeOnGpu, RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu,
SubSizeOnGpu,
};
use crate::shortint::parameters::{
TestParameters, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS,
};
use crate::shortint::ClassicPBSParameters;
use crate::{set_server_key, ClientKey, ConfigBuilder, FheUint32, GpuIndex};
use crate::{set_server_key, ClientKey, ConfigBuilder, FheBool, FheUint32, GpuIndex};
use rand::Rng;
/// GPU setup for tests
@@ -384,3 +385,35 @@ fn test_gpu_get_shift_rotate_size_on_gpu() {
GpuIndex::new(0)
));
}
#[test]
fn test_gpu_get_if_then_else_size_on_gpu() {
    // The temporary-buffer sizes reported for if_then_else / select / cmux on
    // unsigned operands must all be allocatable on GPU 0.
    let client_key = setup_gpu(Some(PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS));
    let mut rng = rand::thread_rng();
    let lhs_clear = rng.gen_range(1..=u32::MAX);
    let rhs_clear = rng.gen_range(1..=u32::MAX);
    let cond_clear = rng.gen_range(0..=1);
    let mut lhs = FheUint32::try_encrypt(lhs_clear, &client_key).unwrap();
    let mut rhs = FheUint32::try_encrypt(rhs_clear, &client_key).unwrap();
    let cond = FheBool::encrypt(cond_clear != 0, &client_key);
    lhs.move_to_current_device();
    rhs.move_to_current_device();
    // All three entry points share the same size query contract.
    for tmp_buffer_size in [
        cond.get_if_then_else_size_on_gpu(&lhs, &rhs),
        cond.get_select_size_on_gpu(&lhs, &rhs),
        cond.get_cmux_size_on_gpu(&lhs, &rhs),
    ] {
        assert!(check_valid_cuda_malloc(tmp_buffer_size, GpuIndex::new(0)));
    }
}

View File

@@ -9,8 +9,8 @@
pub use crate::high_level_api::traits::{
BitSlice, CiphertextList, DivRem, FheDecrypt, FheEncrypt, FheEq, FheKeyswitch, FheMax, FheMin,
FheOrd, FheTrivialEncrypt, FheTryEncrypt, FheTryTrivialEncrypt, FheWait, IfThenElse,
OverflowingAdd, OverflowingMul, OverflowingSub, RotateLeft, RotateLeftAssign, RotateRight,
RotateRightAssign, ScalarIfThenElse, SquashNoise, Tagged,
OverflowingAdd, OverflowingMul, OverflowingNeg, OverflowingSub, RotateLeft, RotateLeftAssign,
RotateRight, RotateRightAssign, ScalarIfThenElse, SquashNoise, Tagged,
};
#[cfg(feature = "hpu")]
pub use crate::high_level_api::traits::{FheHpu, HpuHandle};
@@ -27,6 +27,6 @@ pub use crate::high_level_api::strings::traits::*;
#[cfg(feature = "gpu")]
pub use crate::high_level_api::traits::{
AddSizeOnGpu, BitAndSizeOnGpu, BitNotSizeOnGpu, BitOrSizeOnGpu, BitXorSizeOnGpu,
FheMaxSizeOnGpu, FheMinSizeOnGpu, FheOrdSizeOnGpu, RotateLeftSizeOnGpu, RotateRightSizeOnGpu,
ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
FheMaxSizeOnGpu, FheMinSizeOnGpu, FheOrdSizeOnGpu, IfThenElseSizeOnGpu, RotateLeftSizeOnGpu,
RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
};

View File

@@ -180,6 +180,12 @@ pub trait OverflowingMul<Rhs> {
fn overflowing_mul(self, rhs: Rhs) -> (Self::Output, FheBool);
}
/// Negation that also reports overflow.
pub trait OverflowingNeg {
    /// Type of the negated value (typically the ciphertext type itself).
    type Output;
    /// Returns the negation of `self` together with an encrypted boolean
    /// overflow flag.
    fn overflowing_neg(self) -> (Self::Output, FheBool);
}
pub trait BitSlice<Bounds> {
type Output;
@@ -302,3 +308,14 @@ pub trait RotateLeftSizeOnGpu<Rhs = Self> {
pub trait RotateRightSizeOnGpu<Rhs = Self> {
fn get_rotate_right_size_on_gpu(&self, other: Rhs) -> u64;
}
#[cfg(feature = "gpu")]
/// Query for the temporary GPU memory an `if_then_else` (a.k.a. `select` /
/// `cmux`) would need on the given operands, so callers can check the
/// allocation is feasible before running the operation.
pub trait IfThenElseSizeOnGpu<Ciphertext> {
    /// Returns the temporary GPU buffer size needed to run `if_then_else`
    /// with `self` as the condition.
    fn get_if_then_else_size_on_gpu(&self, ct_then: &Ciphertext, ct_else: &Ciphertext) -> u64;
    /// `select` uses the same scratch buffer as `if_then_else`.
    fn get_select_size_on_gpu(&self, ct_when_true: &Ciphertext, ct_when_false: &Ciphertext) -> u64 {
        self.get_if_then_else_size_on_gpu(ct_when_true, ct_when_false)
    }
    /// `cmux` uses the same scratch buffer as `if_then_else`.
    fn get_cmux_size_on_gpu(&self, ct_then: &Ciphertext, ct_else: &Ciphertext) -> u64 {
        self.get_if_then_else_size_on_gpu(ct_then, ct_else)
    }
}

View File

@@ -77,40 +77,6 @@ impl CudaRadixCiphertextInfo {
new_block_info
}
/// Returns the block-info list this ciphertext would have after appending
/// `num_blocks` trivial zero blocks at the most-significant end.
pub(crate) fn after_extend_radix_with_trivial_zero_blocks_msb(
    &self,
    num_blocks: usize,
) -> Self {
    assert!(num_blocks > 0);
    // Trivial zero blocks carry degree 0 and zero noise; the moduli and
    // atomic pattern are copied from the first existing block.
    let template = self.blocks.first().unwrap();
    let trivial_block = CudaBlockInfo {
        degree: Degree::new(0),
        message_modulus: template.message_modulus,
        carry_modulus: template.carry_modulus,
        atomic_pattern: template.atomic_pattern,
        noise_level: NoiseLevel::ZERO,
    };
    let mut blocks = Vec::with_capacity(self.blocks.len() + num_blocks);
    blocks.extend(self.blocks.iter().copied());
    blocks.extend(std::iter::repeat(trivial_block).take(num_blocks));
    Self { blocks }
}
/// Returns the block-info list this ciphertext would have after dropping its
/// `num_blocks` least-significant blocks.
pub(crate) fn after_trim_radix_blocks_lsb(&self, num_blocks: usize) -> Self {
    Self {
        blocks: self.blocks[num_blocks..].to_vec(),
    }
}
pub(crate) fn after_trim_radix_blocks_msb(&self, num_blocks: usize) -> Self {
assert!(num_blocks > 0);

View File

@@ -3608,6 +3608,60 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async<T: UnsignedInteger, B: Numer
update_noise_degree(radix_lwe_out, &cuda_ffi_radix_lwe_out);
}
/// Returns the GPU scratch-buffer size required by a radix cmux with the
/// given cryptographic parameters.
///
/// NOTE(review): the size comes from the scratch entry point called with a
/// `false` flag — presumably "track size only, do not allocate", matching the
/// other `get_*_size_on_gpu` helpers; confirm against the C API.
#[allow(clippy::too_many_arguments)]
pub fn get_cmux_integer_radix_kb_size_on_gpu(
    streams: &CudaStreams,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    big_lwe_dimension: LweDimension,
    small_lwe_dimension: LweDimension,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    num_blocks: u32,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
    noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
) -> u64 {
    // The modulus-switch noise array is only needed when a noise-reduction
    // key is present.
    let allocate_ms_noise_array = noise_reduction_key.is_some();
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    // SAFETY: FFI call; `mem_ptr` is a valid out-pointer and the stream/GPU
    // handles come from a live `CudaStreams`.
    let size_tracker = unsafe {
        scratch_cuda_integer_radix_cmux_kb_64(
            streams.ptr.as_ptr(),
            streams.gpu_indexes_ptr(),
            streams.len() as u32,
            std::ptr::addr_of_mut!(mem_ptr),
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            big_lwe_dimension.0 as u32,
            small_lwe_dimension.0 as u32,
            ks_level.0 as u32,
            ks_base_log.0 as u32,
            pbs_level.0 as u32,
            pbs_base_log.0 as u32,
            grouping_factor.0 as u32,
            num_blocks,
            message_modulus.0 as u32,
            carry_modulus.0 as u32,
            pbs_type as u32,
            false,
            allocate_ms_noise_array,
        )
    };
    // Release whatever metadata the scratch call created; only the tracked
    // size is returned to the caller.
    // SAFETY: `mem_ptr` was initialized by the scratch call above.
    unsafe {
        cleanup_cuda_integer_radix_cmux(
            streams.ptr.as_ptr(),
            streams.gpu_indexes_ptr(),
            streams.len() as u32,
            std::ptr::addr_of_mut!(mem_ptr),
        );
    }
    size_tracker
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
@@ -3954,6 +4008,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams: &CudaStreams,
result: &mut CudaRadixCiphertext,
radix_list: &mut CudaRadixCiphertext,
reduce_degrees_for_single_carry_propagation: bool,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
@@ -4048,6 +4103,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams.len() as u32,
&raw mut cuda_ffi_result,
&raw mut cuda_ffi_radix_list,
reduce_degrees_for_single_carry_propagation,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
@@ -5387,3 +5443,60 @@ pub unsafe fn unchecked_negate_integer_radix_async(
);
update_noise_degree(radix_lwe_out, &cuda_ffi_radix_lwe_out);
}
/// Asynchronously fills `output` with `input` minus its least-significant
/// radix blocks (via `trim_radix_blocks_lsb_64`), then refreshes `output`'s
/// per-block degree/noise metadata from what the backend reported.
///
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until streams is synchronized
pub unsafe fn trim_radix_blocks_lsb_async(
    output: &mut CudaRadixCiphertext,
    input: &CudaRadixCiphertext,
    streams: &CudaStreams,
) {
    // Flatten degree/noise metadata into the plain integer buffers the FFI
    // layer expects.
    let mut input_degrees = input.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut input_noise_levels = input.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let mut output_degrees = output.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut output_noise_levels = output.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let mut cuda_ffi_output =
        prepare_cuda_radix_ffi(output, &mut output_degrees, &mut output_noise_levels);
    let cuda_ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels);
    trim_radix_blocks_lsb_64(
        &raw mut cuda_ffi_output,
        &raw const cuda_ffi_input,
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
    );
    // Write back the degrees/noise levels the backend computed for `output`.
    update_noise_degree(output, &cuda_ffi_output);
}
/// Asynchronously fills `output` with `input` extended by trivial zero blocks
/// at the most-significant end (via
/// `extend_radix_with_trivial_zero_blocks_msb_64`), then refreshes `output`'s
/// per-block degree/noise metadata from what the backend reported.
///
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until streams is synchronized
pub unsafe fn extend_radix_with_trivial_zero_blocks_msb_async(
    output: &mut CudaRadixCiphertext,
    input: &CudaRadixCiphertext,
    streams: &CudaStreams,
) {
    // Flatten degree/noise metadata into the plain integer buffers the FFI
    // layer expects.
    let mut input_degrees = input.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut input_noise_levels = input.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let mut output_degrees = output.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut output_noise_levels = output.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let mut cuda_ffi_output =
        prepare_cuda_radix_ffi(output, &mut output_degrees, &mut output_noise_levels);
    let cuda_ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels);
    extend_radix_with_trivial_zero_blocks_msb_64(
        &raw mut cuda_ffi_output,
        &raw const cuda_ffi_input,
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
    );
    // Write back the degrees/noise levels the backend computed for `output`.
    update_noise_degree(output, &cuda_ffi_output);
}

View File

@@ -337,6 +337,7 @@ impl CudaServerKey {
&self,
result: &mut T,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) {
if ciphertexts.is_empty() {
@@ -377,6 +378,7 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -402,6 +404,7 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -445,7 +448,7 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> T {
let mut result = self
.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams)
.unchecked_partial_sum_ciphertexts_async(ciphertexts, true, streams)
.unwrap();
self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
@@ -458,7 +461,8 @@ impl CudaServerKey {
ciphertexts: &[T],
streams: &CudaStreams,
) -> Option<T> {
let result = unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) };
let result = unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts,
false, streams) };
streams.synchronize();
result
}
@@ -470,6 +474,7 @@ impl CudaServerKey {
pub unsafe fn unchecked_partial_sum_ciphertexts_async<T: CudaIntegerRadixCiphertext>(
&self,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) -> Option<T> {
if ciphertexts.is_empty() {
@@ -483,7 +488,8 @@ impl CudaServerKey {
return Some(result);
}
self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts, streams);
self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts,
reduce_degrees_for_single_carry_propagation, streams);
Some(result)
}

View File

@@ -5,6 +5,7 @@ use crate::core_crypto::gpu::algorithms::{
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
@@ -96,6 +97,31 @@ impl CudaServerKey {
ct.as_mut().info = ct.as_ref().info.after_bitnot();
}
/// In-place boolean NOT on the GPU, computed as `(-ct) + encode(1)` to avoid
/// allocating a temporary ciphertext.
pub(crate) unsafe fn unchecked_boolean_bitnot_assign_async(
    &self,
    ct: &mut CudaBooleanBlock,
    streams: &CudaStreams,
) {
    // NOT is negate-then-shift: the code adds the encoding of cleartext 1
    // after negating. (NOTE(review): an earlier comment here said
    // `msg_mod - 1`, but the code below encodes `Cleartext(1)`.)
    cuda_lwe_ciphertext_negate_assign(&mut ct.0.as_mut().d_blocks, streams);
    let ct_blocks = ct.0.as_ref().d_blocks.lwe_ciphertext_count().0;
    let shift_plaintext = self.encoding().encode(Cleartext(1u64)).0;
    // One copy of the shift plaintext per LWE block, uploaded to GPU index 0.
    let scalar_vector = vec![shift_plaintext; ct_blocks];
    let mut d_decomposed_scalar =
        CudaVec::<u64>::new_async(ct.0.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0);
    d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0);
    cuda_lwe_ciphertext_plaintext_add_assign(
        &mut ct.0.as_mut().d_blocks,
        &d_decomposed_scalar,
        streams,
    );
    // Neither noise level nor the degree changes
}
pub fn unchecked_bitnot_assign<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,
@@ -165,7 +191,7 @@ impl CudaServerKey {
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
/// not be dropped until streams is synchronized
pub unsafe fn unchecked_bitop_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
@@ -560,7 +586,7 @@ impl CudaServerKey {
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
/// not be dropped until streams is synchronized
pub unsafe fn bitand_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
@@ -666,7 +692,7 @@ impl CudaServerKey {
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
/// not be dropped until streams is synchronized
pub unsafe fn bitor_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
@@ -771,7 +797,7 @@ impl CudaServerKey {
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
/// not be dropped until streams is synchronized
pub unsafe fn bitxor_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct_left: &mut T,
@@ -869,7 +895,7 @@ impl CudaServerKey {
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronised
/// not be dropped until streams is synchronized
pub unsafe fn bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
&self,
ct: &mut T,

View File

@@ -3,7 +3,10 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType};
use crate::integer::gpu::{
get_cmux_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType,
};
impl CudaServerKey {
/// # Safety
@@ -131,4 +134,113 @@ impl CudaServerKey {
self.unchecked_if_then_else(condition, true_ct, false_ct, stream)
}
/// Returns an estimate (in bytes) of the peak GPU scratch memory required to run
/// an `if_then_else` (cmux) over the given radix ciphertexts.
///
/// The estimate is the maximum of two independent costs:
/// 1. the memory needed to full-propagate carries on an input whose carry
///    blocks are not empty (plus, in one case, room for a ciphertext copy), and
/// 2. the memory needed by the cmux integer-radix kernel itself.
///
/// Taking `max` rather than the sum presumably reflects that the propagation
/// scratch is released before the cmux buffers are allocated — TODO confirm
/// against the backend allocation order.
///
/// NOTE(review): `_condition` is accepted but unused; the boolean block's
/// footprint is apparently not part of this estimate — verify that is intended.
pub fn get_if_then_else_size_on_gpu<T: CudaIntegerRadixCiphertext>(
    &self,
    _condition: &CudaBooleanBlock,
    true_ct: &T,
    false_ct: &T,
    streams: &CudaStreams,
) -> u64 {
    // Both branches must be structurally compatible: same LWE dimension and
    // same number of radix blocks.
    assert_eq!(
        true_ct.as_ref().d_blocks.lwe_dimension(),
        false_ct.as_ref().d_blocks.lwe_dimension()
    );
    assert_eq!(
        true_ct.as_ref().d_blocks.lwe_ciphertext_count(),
        false_ct.as_ref().d_blocks.lwe_ciphertext_count()
    );
    // Cost of one full carry-propagation pass; the parameters fed to the size
    // query depend on which bootstrapping-key flavor this server key holds.
    let full_prop_mem = match &self.bootstrapping_key {
        CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
            streams,
            d_bsk.input_lwe_dimension(),
            d_bsk.glwe_dimension(),
            d_bsk.polynomial_size(),
            self.key_switching_key.decomposition_level_count(),
            self.key_switching_key.decomposition_base_log(),
            d_bsk.decomp_level_count(),
            d_bsk.decomp_base_log(),
            self.message_modulus,
            self.carry_modulus,
            PBSType::Classical,
            // Grouping factor is only meaningful for multi-bit PBS; 0 here.
            LweBskGroupingFactor(0),
            d_bsk.d_ms_noise_reduction_key.as_ref(),
        ),
        CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
            get_full_propagate_assign_size_on_gpu(
                streams,
                d_multibit_bsk.input_lwe_dimension(),
                d_multibit_bsk.glwe_dimension(),
                d_multibit_bsk.polynomial_size(),
                self.key_switching_key.decomposition_level_count(),
                self.key_switching_key.decomposition_base_log(),
                d_multibit_bsk.decomp_level_count(),
                d_multibit_bsk.decomp_base_log(),
                self.message_modulus,
                self.carry_modulus,
                PBSType::MultiBit,
                d_multibit_bsk.grouping_factor,
                // No mod-switch noise-reduction key for the multi-bit variant.
                None,
            )
        }
    };
    // Propagation is only paid for operands whose carries are not empty.
    // NOTE(review): the (true, false) arm adds the size of `true_ct` while the
    // (false, true) arm adds nothing — presumably mirroring which operand the
    // actual cmux path duplicates before propagating (the two ciphertexts have
    // equal sizes per the asserts above); confirm against the runtime path.
    let actual_full_prop_mem = match (
        true_ct.block_carries_are_empty(),
        false_ct.block_carries_are_empty(),
    ) {
        (true, true) => 0,
        (true, false) => self.get_ciphertext_size_on_gpu(true_ct) + full_prop_mem,
        (false, true) => full_prop_mem,
        (false, false) => self.get_ciphertext_size_on_gpu(false_ct) + full_prop_mem,
    };
    let lwe_ciphertext_count = true_ct.as_ref().d_blocks.lwe_ciphertext_count();
    // Scratch required by the cmux integer-radix kernel itself, again keyed on
    // the bootstrapping-key flavor.
    let cmux_mem = match &self.bootstrapping_key {
        CudaBootstrappingKey::Classic(d_bsk) => get_cmux_integer_radix_kb_size_on_gpu(
            streams,
            self.message_modulus,
            self.carry_modulus,
            d_bsk.glwe_dimension,
            d_bsk.polynomial_size,
            self.key_switching_key
                .input_key_lwe_size()
                .to_lwe_dimension(),
            self.key_switching_key
                .output_key_lwe_size()
                .to_lwe_dimension(),
            self.key_switching_key.decomposition_level_count(),
            self.key_switching_key.decomposition_base_log(),
            d_bsk.decomp_level_count,
            d_bsk.decomp_base_log,
            lwe_ciphertext_count.0 as u32,
            PBSType::Classical,
            LweBskGroupingFactor(0),
            d_bsk.d_ms_noise_reduction_key.as_ref(),
        ),
        CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
            get_cmux_integer_radix_kb_size_on_gpu(
                streams,
                self.message_modulus,
                self.carry_modulus,
                d_multibit_bsk.glwe_dimension,
                d_multibit_bsk.polynomial_size,
                self.key_switching_key
                    .input_key_lwe_size()
                    .to_lwe_dimension(),
                self.key_switching_key
                    .output_key_lwe_size()
                    .to_lwe_dimension(),
                self.key_switching_key.decomposition_level_count(),
                self.key_switching_key.decomposition_base_log(),
                d_multibit_bsk.decomp_level_count,
                d_multibit_bsk.decomp_base_log,
                lwe_ciphertext_count.0 as u32,
                PBSType::MultiBit,
                d_multibit_bsk.grouping_factor,
                None,
            )
        }
    };
    // Peak usage: the larger of the propagation phase and the cmux phase.
    actual_full_prop_mem.max(cmux_mem)
}
}

View File

@@ -93,8 +93,7 @@ impl CudaServerKey {
},
);
let mut output_cts: T =
self.create_trivial_zero_radix_async(num_ct_blocks * num_ct_blocks, streams);
let mut output_cts: T = self.create_trivial_zero_radix_async(num_ct_blocks, streams);
self.compute_prefix_sum_hillis_steele_async(
output_cts.as_mut(),
@@ -455,7 +454,7 @@ impl CudaServerKey {
cts.push(new_trivial);
let result = self
.unchecked_partial_sum_ciphertexts_async(&cts, streams)
.unchecked_partial_sum_ciphertexts_async(&cts, false, streams)
.expect("internal error, empty ciphertext count");
// This is the part where we extract message and carry blocks
@@ -497,28 +496,6 @@ impl CudaServerKey {
.as_mut_slice(0..lwe_size, 0)
.unwrap();
let mut carry_blocks_last = carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(
lwe_size * (counter_num_blocks - 1)..lwe_size * counter_num_blocks,
0,
)
.unwrap();
carry_blocks_last.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
carry_blocks.as_mut().info.blocks.last_mut().unwrap().degree =
Degree(self.message_modulus.0 - 1);
carry_blocks
.as_mut()
.info
.blocks
.last_mut()
.unwrap()
.noise_level = NoiseLevel::ZERO;
self.apply_lookup_table_async(
carry_blocks.as_mut(),
result.as_ref(),
@@ -527,10 +504,43 @@ impl CudaServerKey {
streams,
);
let mut rotated_carry_blocks: CudaSignedRadixCiphertext =
self.create_trivial_zero_radix(counter_num_blocks, streams);
let mut rotated_slice = rotated_carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks) * lwe_size, 0)
.unwrap();
let first_block;
let last_blocks;
(first_block, last_blocks) = rotated_slice.split_at_mut(lwe_size, 0);
let mut tmp_carry_blocks3 = carry_blocks.duplicate(streams);
let carry_slice = tmp_carry_blocks3
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks - 1) * lwe_size, 0)
.unwrap();
last_blocks
.unwrap()
.copy_from_gpu_async(&carry_slice, streams, 0);
first_block
.unwrap()
.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
let mut ciphertexts = Vec::<CudaSignedRadixCiphertext>::with_capacity(3);
for mut info in &mut rotated_carry_blocks.ciphertext.info.blocks {
info.degree = Degree(self.message_modulus.0 - 1);
}
ciphertexts.push(message_blocks);
ciphertexts.push(carry_blocks);
ciphertexts.push(rotated_carry_blocks);
let trivial_ct: CudaSignedRadixCiphertext =
self.create_trivial_radix_async(2u32, counter_num_blocks, streams);

Some files were not shown because too many files have changed in this diff Show More