refactor(gpu): random test fft

fix(gpu): fix gpu index in casts, scalar comparison, scalar mul, etc.
feat(gpu): Implement propagate_single_carry_get_input_carries
2026-04-28 03:01:21 -04:00 · 2024-06-27 10:10:13 +02:00 · 2024-06-27 10:08:11 +02:00 · 2024-06-26 17:34:28 +02:00 · 2024-06-26 16:47:29 +02:00 · 2024-06-26 11:30:05 +02:00
44 changed files with 1287 additions and 605 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -3,6 +3,7 @@ self-hosted-runner:
  labels:
    - m1mac
    - 4090-desktop
+    - large_windows_16_latest
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_multi_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_multi_gpu_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -129,7 +129,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -235,7 +235,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,7 +90,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -19,7 +19,7 @@ jobs:

    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest-large, windows-latest]
+        os: [ubuntu-latest, macos-latest-large, large_windows_16_latest]
      fail-fast: false

    steps:
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -125,7 +125,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_full_benchmark.yml
+++ b/.github/workflows/gpu_4090_full_benchmark.yml
@@ -27,7 +27,7 @@ jobs:
    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
-      cancel-in-progress: true
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ["self-hosted", "4090-desktop"]
    timeout-minutes: 1440 # 24 hours
    strategy:
@@ -114,7 +114,7 @@ jobs:
    needs: cuda-integer-benchmarks
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
-      cancel-in-progress: true
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ["self-hosted", "4090-desktop"]
    timeout-minutes: 1440 # 24 hours

--- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -106,6 +106,10 @@ jobs:
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

+      - name:
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run core crypto, integer and internal CUDA backend tests
        run: |
          make test_gpu
@@ -143,7 +147,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -180,7 +180,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -189,7 +189,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -28,7 +28,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -40,8 +40,9 @@ jobs:
          fetch-depth: 0

      - name: Create NPM version tag
+        if: ${{ inputs.npm_latest_tag }}
        run: |
-          echo "NPM_TAG=$(sed -n -e '1,/^version/p' tfhe/Cargo.toml | grep '^version[[:space:]]*=' | cut -d '=' -f 2 | tr -d ' ')" >> "${GITHUB_ENV}"
+          echo "NPM_TAG=latest" >> "${GITHUB_ENV}"

      - name: Publish crate.io package
        if: ${{ inputs.push_to_crates }}
@@ -65,14 +66,6 @@ jobs:
          dry-run: ${{ inputs.dry_run }}
          tag: ${{ env.NPM_TAG }}

-      - name: Publish web package as latest
-        if: ${{ inputs.push_web_package && inputs.npm_latest_tag }}
-        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
-        with:
-          token: ${{ secrets.NPM_TOKEN }}
-          package: tfhe/pkg/package.json
-          dry-run: ${{ inputs.dry_run }}
-
      - name: Build Node package
        if: ${{ inputs.push_node_package }}
        run: |
@@ -90,14 +83,6 @@ jobs:
          dry-run: ${{ inputs.dry_run }}
          tag: ${{ env.NPM_TAG }}

-      - name: Publish Node package as latest
-        if: ${{ inputs.push_node_package && inputs.npm_latest_tag }}
-        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
-        with:
-          token: ${{ secrets.NPM_TOKEN }}
-          package: tfhe/pkg/package.json
-          dry-run: ${{ inputs.dry_run }}
-
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/zk_pke_benchmark.yml
+++ b/.github/workflows/zk_pke_benchmark.yml
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -82,7 +82,7 @@ jobs:
    needs: [ should-run, setup-instance ]
    concurrency:
      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
-      cancel-in-progress: true
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
@@ -182,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -272,6 +272,11 @@ void cuda_propagate_single_carry_kb_64_inplace(
    void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks);

+void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
+    void **ksks, uint32_t num_blocks);
+
 void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -294,9 +294,6 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
    __syncthreads();
  }

-  // compressed size = 8192 is actual polynomial size = 16384.
-  // from this size, twiddles can't fit in constant memory,
-  // so from here, butterfly operation access device memory.
  if constexpr (params::degree >= 8192) {
    // level 13
    tid = threadIdx.x;
@@ -307,7 +304,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
           (tid & (params::degree / 8192 - 1));
      i2 = i1 + params::degree / 8192;

-      w = negtwiddles13[twid_id];
+      w = negtwiddles[twid_id + 4096];
      u = A[i1];
      v = A[i2] * w;

@@ -351,10 +348,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  // mapping in backward fft is reversed
  // butterfly operation is started from last level

-  // compressed size = 8192 is actual polynomial size = 16384.
-  // twiddles for this size can't fit in constant memory so
-  // butterfly operation for this level access device memory to fetch
-  // twiddles
  if constexpr (params::degree >= 8192) {
    // level 13
    tid = threadIdx.x;
@@ -365,7 +358,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
           (tid & (params::degree / 8192 - 1));
      i2 = i1 + params::degree / 8192;

-      w = negtwiddles13[twid_id];
+      w = negtwiddles[twid_id + 4096];
      u = A[i1] - A[i2];

      A[i1] += A[i2];
@@ -722,4 +715,312 @@ __global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
  }
 }

+template <class params> __device__ void NSMFFT_direct_bundle(double2 *A, const double2 regs[4]) {
+
+  /* We don't make bit reverse here, since twiddles are already reversed
+   *  Each thread is always in charge of "opt/2" pairs of coefficients,
+   *  which is why we always loop through N/2 by N/opt strides
+   *  The pragma unroll instruction tells the compiler to unroll the
+   *  full loop, which should increase performance
+   */
+
+  size_t tid = threadIdx.x;
+  size_t twid_id;
+  size_t i1, i2;
+  double2 u, v, w;
+  // level 1
+  // we don't make actual complex multiplication on level1 since we have only
+  // one twiddle, it's real and image parts are equal, so we can multiply
+  // it with simpler operations
+  // degree = 1024, opt = 2 -> 
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    i1 = tid;
+    i2 = tid + params::degree / 2;
+
+    //u = A[i1];
+    //v = A[i2] * (double2){0.707106781186547461715008466854,
+    //                      0.707106781186547461715008466854};
+
+    u = regs[i];
+    v = regs[i + params::opt / 2] * (double2){0.707106781186547461715008466854,
+                          0.707106781186547461715008466854};
+    //A[i1] += v;
+    A[i1] = u + v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt; //256
+  }
+  
+  __syncthreads();
+
+  // level 2
+  // from this level there are more than one twiddles and none of them has equal
+  // real and imag parts, so complete complex multiplication is needed
+  // for each level params::degree / 2^level represents number of coefficients
+  // inside divided chunk of specific level
+  //
+#pragma unroll
+  for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+    tid = threadIdx.x + i * params::degree / params::opt;
+    twid_id = tid / (params::degree / 4);
+    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
+    i2 = i1 + params::degree / 4;
+
+    w = negtwiddles[twid_id + 2];
+    u = A[i1];
+    v = A[i2] * w;
+  
+    A[i1] += v;
+    A[i2] = u - v;
+  }
+  __syncthreads();
+
+  // level 3
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 8);
+    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
+    i2 = i1 + params::degree / 8;
+
+    w = negtwiddles[twid_id + 4];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 4
+  //tid = threadIdx.x;
+  //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+  for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+    tid = threadIdx.x + i * params::degree / params::opt;
+    twid_id = tid / (params::degree / 16);
+    i1 =
+        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
+    i2 = i1 + params::degree / 16;
+
+    w = negtwiddles[twid_id + 8];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    //tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 5
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 32);
+    i1 =
+        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
+    i2 = i1 + params::degree / 32;
+
+    w = negtwiddles[twid_id + 16];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 6
+  //tid = threadIdx.x;
+  //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+  for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+    tid = threadIdx.x + i * params::degree / params::opt;
+    twid_id = tid / (params::degree / 64);
+    i1 =
+        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
+    i2 = i1 + params::degree / 64;
+
+    w = negtwiddles[twid_id + 32];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+    //tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 7
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 128);
+    i1 = 2 * (params::degree / 128) * twid_id +
+         (tid & (params::degree / 128 - 1));
+    i2 = i1 + params::degree / 128;
+
+    w = negtwiddles[twid_id + 64];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // from level 8, we need to check size of params degree, because we support
+  // minimum actual polynomial size = 256,  when compressed size is halfed and
+  // minimum supported compressed size is 128, so we always need first 7
+  // levels of butterfly operation, since butterfly levels are hardcoded
+  // we need to check if polynomial size is big enough to require specific level
+  // of butterfly.
+  if constexpr (params::degree >= 256) {
+    // level 8
+    //tid = threadIdx.x;
+    //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+    for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+      tid = threadIdx.x + i * params::degree / params::opt;
+      twid_id = tid / (params::degree / 256);
+      i1 = 2 * (params::degree / 256) * twid_id +
+           (tid & (params::degree / 256 - 1));
+      i2 = i1 + params::degree / 256;
+
+      w = negtwiddles[twid_id + 128];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      //tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 512) {
+    // level 9
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 512);
+      i1 = 2 * (params::degree / 512) * twid_id +
+           (tid & (params::degree / 512 - 1));
+      i2 = i1 + params::degree / 512;
+
+      w = negtwiddles[twid_id + 256];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 1024) {
+    // level 10
+    //tid = threadIdx.x;
+    //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+    for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+      tid = threadIdx.x + i * params::degree / params::opt;
+      twid_id = tid / (params::degree / 1024);
+      i1 = 2 * (params::degree / 1024) * twid_id +
+           (tid & (params::degree / 1024 - 1));
+      i2 = i1 + params::degree / 1024;
+
+      w = negtwiddles[twid_id + 512];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      //tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 2048) {
+    // level 11
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 2048);
+      i1 = 2 * (params::degree / 2048) * twid_id +
+           (tid & (params::degree / 2048 - 1));
+      i2 = i1 + params::degree / 2048;
+
+      w = negtwiddles[twid_id + 1024];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 4096) {
+    // level 12
+    //tid = threadIdx.x;
+    //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+    for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+      tid = threadIdx.x + i * params::degree / params::opt;
+      twid_id = tid / (params::degree / 4096);
+      i1 = 2 * (params::degree / 4096) * twid_id +
+           (tid & (params::degree / 4096 - 1));
+      i2 = i1 + params::degree / 4096;
+
+      w = negtwiddles[twid_id + 2048];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      //tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 8192) {
+    // level 13
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 8192);
+      i1 = 2 * (params::degree / 8192) * twid_id +
+           (tid & (params::degree / 8192 - 1));
+      i2 = i1 + params::degree / 8192;
+
+      w = negtwiddles[twid_id + 4096];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+}
+
 #endif // GPU_BOOTSTRAP_FFT_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
@@ -1,6 +1,6 @@
 #include "cuComplex.h"

-__constant__ double2 negtwiddles[4096] = {
+__device__ double2 negtwiddles[8192] = {
    {0, 0},
    {0.707106781186547461715008466854, 0.707106781186547572737310929369},
    {0.92387953251128673848313610506, 0.382683432365089781779232680492},
@@ -4096,9 +4096,7 @@ __constant__ double2 negtwiddles[4096] = {
    {0.70791982920081630847874976098, 0.706292797233758484765075991163},
    {-0.706292797233758484765075991163, 0.70791982920081630847874976098},
    {0.00115048533711384847431913325266, 0.99999933819152553304832053982},
-    {-0.99999933819152553304832053982, 0.00115048533711384847431913325266}};
-
-__device__ double2 negtwiddles13[4096] = {
+    {-0.99999933819152553304832053982, 0.00115048533711384847431913325266},
    {0.999999981616429334252416083473, 0.000191747597310703291528452552051},
    {-0.000191747597310703291528452552051, 0.999999981616429334252416083473},
    {0.706971182161065359039753275283, 0.707242354213734603085583785287},
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
@@ -2,12 +2,7 @@
 #define GPU_BOOTSTRAP_TWIDDLES_CUH

 /*
- * 'negtwiddles' are stored in constant memory for faster access times
- * because of it's limited size, only twiddles for up to 2^12 polynomial size
- * can be stored there, twiddles for 2^13 are stored in device memory
- * 'negtwiddles13'
+ * 'negtwiddles' are stored in device memory to profit caching
 */
-
-extern __constant__ double2 negtwiddles[4096];
-extern __device__ double2 negtwiddles13[4096];
+extern __device__ double2 negtwiddles[8192];
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -68,6 +68,18 @@ void cuda_propagate_single_carry_kb_64_inplace(
  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      nullptr, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
+}
+
+void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
+    void **ksks, uint32_t num_blocks) {
+  host_propagate_single_carry<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      static_cast<uint64_t *>(input_carries),
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      num_blocks);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -427,7 +427,7 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
 template <typename Torus>
 void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *lwe_array,
-                                 Torus *carry_out,
+                                 Torus *carry_out, Torus *input_carries,
                                 int_sc_prop_memory<Torus> *mem, void **bsks,
                                 Torus **ksks, uint32_t num_blocks) {
  auto params = mem->params;
@@ -482,6 +482,12 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                    gpu_indexes[0]);

+  if (input_carries != nullptr) {
+    cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
+                                 big_lwe_size_bytes * num_blocks, streams[0],
+                                 gpu_indexes[0]);
+  }
+
  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
                glwe_dimension * polynomial_size, num_blocks);

--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -368,8 +368,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
                num_blocks);

  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
-                                     radix_lwe_out, nullptr, mem_ptr->scp_mem,
-                                     bsks, ksks, num_blocks);
+                                     radix_lwe_out, nullptr, nullptr,
+                                     mem_ptr->scp_mem, bsks, ksks, num_blocks);
 }

 template <typename Torus, typename STorus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -84,6 +84,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
        grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
    Torus *bsk_poly = bsk_slice + poly_id * params::degree;

+    // opt = 8 degree/opt = 256
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
        bsk_poly, accumulator);

@@ -114,6 +115,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    // Move accumulator to local memory
    double2 temp[params::opt / 2];
    int tid = threadIdx.x;
+    //opt = 8 degree=2048  degree/opt =256
 #pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
@@ -123,17 +125,22 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
      temp[i].y /= (double)std::numeric_limits<Torus>::max();
      tid += params::degree / params::opt;
    }
-
+/*
    synchronize_threads_in_block();
+    
    // Move from local memory back to shared memory but as complex
    tid = threadIdx.x;
+    //Loop for 4 times ... temp[4]
+
 #pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      fft[tid] = temp[i];
-      tid += params::degree / params::opt;
+      tid += params::degree / params::opt; // degree 2048 opt 8  degree/opt = 256
    }
    synchronize_threads_in_block();
    NSMFFT_direct<HalfDegree<params>>(fft);
+*/
+    NSMFFT_direct_bundle<HalfDegree<params>>(fft, temp);

    // lwe iteration
    auto keybundle_out = get_ith_mask_kth_block(
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -1068,6 +1068,19 @@ extern "C" {
        num_blocks: u32,
    );

+    pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        radix_lwe: *mut c_void,
+        carry_out: *mut c_void,
+        input_carries: *mut c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        num_blocks: u32,
+    );
+
    pub fn cleanup_cuda_propagate_single_carry(
        streams: *const *mut c_void,
        gpu_indexes: *const u32,
--- a/tfhe-zk-pok/Cargo.toml
+++ b/tfhe-zk-pok/Cargo.toml
@@ -13,9 +13,9 @@ description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryp

 [dependencies]
 ark-bls12-381 = { package = "tfhe-ark-bls12-381", version = "0.4.0" }
-ark-ec = { package = "tfhe-ark-ec", version = "0.4.2" }
-ark-ff = { package = "tfhe-ark-ff", version = "0.4.3" }
-ark-poly = { package = "tfhe-ark-poly", version = "0.4.2" }
+ark-ec = { package = "tfhe-ark-ec", version = "0.4.2", features = ["parallel"] }
+ark-ff = { package = "tfhe-ark-ff", version = "0.4.3", features = ["parallel"] }
+ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = ["parallel"] }
 ark-serialize = { version = "0.4.2" }
 rand = "0.8.5"
 rayon = "1.8.0"
--- a/tfhe-zk-pok/src/curve_api/bls12_446.rs
+++ b/tfhe-zk-pok/src/curve_api/bls12_446.rs
@@ -242,6 +242,96 @@ mod g2 {
                .unwrap(),
            }
        }
+
+        // m is an intermediate variable that's used in both the curve point addition and pairing
+        // functions. we cache it since it requires a Zp division
+        // https://hackmd.io/@tazAymRSQCGXTUKkbh1BAg/Sk27liTW9#Math-Formula-for-Point-Addition
+        pub(crate) fn compute_m(self, other: G2Affine) -> Option<crate::curve_446::Fq2> {
+            let zero = crate::curve_446::Fq2::ZERO;
+
+            // in the context of elliptic curves, the point at infinity is the zero element of the
+            // group
+            if self.inner.infinity || other.inner.infinity {
+                return None;
+            }
+
+            if self == other {
+                let x = self.inner.x;
+                let y = self.inner.y;
+                if y == zero {
+                    None
+                } else {
+                    let xx = x.square();
+                    Some((xx.double() + xx) / y.double())
+                }
+            } else {
+                let x1 = self.inner.x;
+                let y1 = self.inner.y;
+                let x2 = other.inner.x;
+                let y2 = other.inner.y;
+
+                let x_delta = x2 - x1;
+                let y_delta = y2 - y1;
+
+                if x_delta == zero {
+                    None
+                } else {
+                    Some(y_delta / x_delta)
+                }
+            }
+        }
+
+        pub(crate) fn double(self, m: Option<crate::curve_446::Fq2>) -> Self {
+            // in the context of elliptic curves, the point at infinity is the zero element of the
+            // group
+            if self.inner.infinity {
+                return self;
+            }
+
+            let mut result = self;
+
+            let x = self.inner.x;
+            let y = self.inner.y;
+
+            if let Some(m) = m {
+                let x3 = m.square() - x.double();
+                let y3 = m * (x - x3) - y;
+
+                (result.inner.x, result.inner.y) = (x3, y3);
+            } else {
+                result.inner.infinity = true;
+            }
+
+            result
+        }
+
+        pub(crate) fn add_unequal(self, other: G2Affine, m: Option<crate::curve_446::Fq2>) -> Self {
+            // in the context of elliptic curves, the point at infinity is the zero element of the
+            // group
+            if self.inner.infinity {
+                return other;
+            }
+            if other.inner.infinity {
+                return self;
+            }
+
+            let mut result = self;
+
+            let x1 = self.inner.x;
+            let y1 = self.inner.y;
+            let x2 = other.inner.x;
+
+            if let Some(m) = m {
+                let x3 = m.square() - x1 - x2;
+                let y3 = m * (x1 - x3) - y1;
+
+                (result.inner.x, result.inner.y) = (x3, y3);
+            } else {
+                result.inner.infinity = true;
+            }
+
+            result
+        }
    }

    #[derive(
@@ -373,9 +463,9 @@ mod g2 {
        }

        pub fn double(self) -> Self {
-            Self {
-                inner: self.inner.double(),
-            }
+            let mut this = self;
+            this.inner.double_in_place();
+            this
        }
    }

@@ -431,51 +521,79 @@ mod g2 {
 }

 mod gt {
+    use crate::curve_446::{Fq, Fq12, Fq2};
+
    use super::*;
-    use ark_ec::bls12::Bls12Config;
    use ark_ec::pairing::{MillerLoopOutput, Pairing};
-    use ark_ff::{CubicExtField, Fp12, Fp2, QuadExtField};
+    use ark_ff::{CubicExtField, QuadExtField};

    type Bls = crate::curve_446::Bls12_446;
-    type Config = crate::curve_446::Config;

-    const ONE: Fp2<<Config as Bls12Config>::Fp2Config> = QuadExtField {
-        c0: MontFp!("1"),
-        c1: MontFp!("0"),
-    };
-    const ZERO: Fp2<<Config as Bls12Config>::Fp2Config> = QuadExtField {
+    const ZERO: Fq2 = QuadExtField {
        c0: MontFp!("0"),
        c1: MontFp!("0"),
    };

-    const U1: Fp12<<Config as Bls12Config>::Fp12Config> = QuadExtField {
-        c0: CubicExtField {
-            c0: ZERO,
-            c1: ZERO,
-            c2: ZERO,
-        },
-        c1: CubicExtField {
-            c0: ONE,
-            c1: ZERO,
-            c2: ZERO,
-        },
+    // computed by copying the result from
+    // let two: Fq = MontFp!("2"); println!("{}", two.inverse().unwrap()), which we can't compute in
+    // a const context;
+    const TWO_INV: Fq = {
+        MontFp!("86412351771428577990035638289747981121746346761394949218917418178192828331138736448451251370148591845087981000773214233672031082665302")
    };
-    const U3: Fp12<<Config as Bls12Config>::Fp12Config> = QuadExtField {
-        c0: CubicExtField {
-            c0: ZERO,
-            c1: ZERO,
-            c2: ZERO,
-        },
-        c1: CubicExtField {
-            c0: ZERO,
-            c1: ONE,
-            c2: ZERO,
-        },
+    const TWO_INV_MINUS_1: Fq = {
+        MontFp!("86412351771428577990035638289747981121746346761394949218917418178192828331138736448451251370148591845087981000773214233672031082665301")
    };

-    const fn fp2_to_fp12(
-        x: Fp2<<Config as Bls12Config>::Fp2Config>,
-    ) -> Fp12<<Config as Bls12Config>::Fp12Config> {
+    // the only non zero value in inv(U1) and inv(U3), which come from Olivier's equations.
+    const C: Fq2 = QuadExtField {
+        c0: TWO_INV,
+        c1: TWO_INV_MINUS_1,
+    };
+
+    fn fp2_mul_c(x: Fq2) -> Fq2 {
+        let x0_c0 = x.c0 * C.c0;
+        let x1_c0 = x.c1 * C.c0;
+
+        let x0_c1 = x0_c0 - x.c0;
+        let x1_c1 = x1_c0 - x.c1;
+
+        QuadExtField {
+            c0: x0_c0 - x1_c1,
+            c1: x0_c1 + x1_c0,
+        }
+    }
+
+    fn fp2_mul_u1_inv(x: Fq2) -> Fq12 {
+        QuadExtField {
+            c0: CubicExtField {
+                c0: ZERO,
+                c1: ZERO,
+                c2: ZERO,
+            },
+            c1: CubicExtField {
+                c0: ZERO,
+                c1: ZERO,
+                c2: fp2_mul_c(x),
+            },
+        }
+    }
+
+    fn fp2_mul_u3_inv(x: Fq2) -> Fq12 {
+        QuadExtField {
+            c0: CubicExtField {
+                c0: ZERO,
+                c1: ZERO,
+                c2: ZERO,
+            },
+            c1: CubicExtField {
+                c0: ZERO,
+                c1: fp2_mul_c(x),
+                c2: ZERO,
+            },
+        }
+    }
+
+    const fn fp2_to_fp12(x: Fq2) -> Fq12 {
        QuadExtField {
            c0: CubicExtField {
                c0: x,
@@ -490,52 +608,59 @@ mod gt {
        }
    }

-    const fn fp_to_fp12(
-        x: <Config as Bls12Config>::Fp,
-    ) -> Fp12<<Config as Bls12Config>::Fp12Config> {
-        fp2_to_fp12(QuadExtField {
+    const fn fp_to_fp2(x: Fq) -> Fq2 {
+        QuadExtField {
            c0: x,
            c1: MontFp!("0"),
-        })
+        }
    }

-    fn ate_tangent_ev(qt: G2, evpt: G1) -> Fp12<<Config as Bls12Config>::Fp12Config> {
-        let qt = qt.inner.into_affine();
-        let evpt = evpt.inner.into_affine();
+    const fn fp_to_fp12(x: Fq) -> Fq12 {
+        fp2_to_fp12(fp_to_fp2(x))
+    }
+
+    fn ate_tangent_ev(qt: G2Affine, evpt: G1Affine, m: Fq2) -> Fq12 {
+        let qt = qt.inner;
+        let evpt = evpt.inner;

        let (xt, yt) = (qt.x, qt.y);
        let (xe, ye) = (evpt.x, evpt.y);

-        let xt = fp2_to_fp12(xt);
-        let yt = fp2_to_fp12(yt);
-        let xe = fp_to_fp12(xe);
-        let ye = fp_to_fp12(ye);
+        let l = m;
+        let mut l_xe = l;
+        l_xe.c0 *= xe;
+        l_xe.c1 *= xe;

-        let three = fp_to_fp12(MontFp!("3"));
-        let two = fp_to_fp12(MontFp!("2"));
+        let mut r0 = fp_to_fp12(ye);
+        let r1 = fp2_mul_u1_inv(l_xe);
+        let r2 = fp2_mul_u3_inv(l * xt - yt);

-        let l = three * xt.square() / (two * yt);
-        ye - (l * xe) / U1 + (l * xt - yt) / U3
+        r0.c1.c1 = r2.c1.c1;
+        r0.c1.c2 = -r1.c1.c2;
+
+        r0
    }

-    fn ate_line_ev(q1: G2, q2: G2, evpt: G1) -> Fp12<<Config as Bls12Config>::Fp12Config> {
-        let q1 = q1.inner.into_affine();
-        let q2 = q2.inner.into_affine();
-        let evpt = evpt.inner.into_affine();
+    fn ate_line_ev(q1: G2Affine, evpt: G1Affine, m: Fq2) -> Fq12 {
+        let q1 = q1.inner;
+        let evpt = evpt.inner;

        let (x1, y1) = (q1.x, q1.y);
-        let (x2, y2) = (q2.x, q2.y);
        let (xe, ye) = (evpt.x, evpt.y);

-        let x1 = fp2_to_fp12(x1);
-        let y1 = fp2_to_fp12(y1);
-        let x2 = fp2_to_fp12(x2);
-        let y2 = fp2_to_fp12(y2);
-        let xe = fp_to_fp12(xe);
-        let ye = fp_to_fp12(ye);
+        let l = m;
+        let mut l_xe = l;
+        l_xe.c0 *= xe;
+        l_xe.c1 *= xe;

-        let l = (y2 - y1) / (x2 - x1);
-        ye - (l * xe) / U1 + (l * x1 - y1) / U3
+        let mut r0 = fp_to_fp12(ye);
+        let r1 = fp2_mul_u1_inv(l * fp_to_fp2(xe));
+        let r2 = fp2_mul_u3_inv(l * x1 - y1);
+
+        r0.c1.c1 = r2.c1.c1;
+        r0.c1.c2 = -r1.c1.c2;
+
+        r0
    }

    #[allow(clippy::needless_range_loop)]
@@ -544,22 +669,24 @@ mod gt {
        let t_bits = b"110000000001000001000000100000000000000000000000000000000100000000000000001";

        let mut fk = fp_to_fp12(MontFp!("1"));
+        let p = p.normalize();
+        let q = q.normalize();
+
        let mut qk = q;

        for k in 1..t_log2 {
-            let lkk = ate_tangent_ev(qk, p);
-            qk = qk + qk;
+            let m = qk.compute_m(qk).unwrap();
+            let lkk = ate_tangent_ev(qk, p, m);
+            qk = qk.double(Some(m));
            fk = fk.square() * lkk;

            if t_bits[k] == b'1' {
-                assert_ne!(q, qk);
-                let lkp1 = if q != -qk {
-                    ate_line_ev(q, qk, p)
-                } else {
-                    fp_to_fp12(MontFp!("1"))
-                };
-                qk += q;
-                fk *= lkp1;
+                let m = q.compute_m(qk);
+                let new_qk = q.add_unequal(qk, m);
+                if !new_qk.inner.infinity {
+                    fk *= ate_line_ev(q, p, m.unwrap());
+                }
+                qk = new_qk;
            }
        }
        let mlo = MillerLoopOutput(fk);
--- a/tfhe/benches/integer/bench.rs
+++ b/tfhe/benches/integer/bench.rs
@@ -1720,7 +1720,7 @@ mod cuda {

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unchecked_unsigned_overflowing_scalar_add,
-        display_name: unsigned_overflowing_scalar_add,
+        display_name: unsigned_overflowing_add,
        rng_func: default_scalar
    );

@@ -1981,7 +1981,7 @@ mod cuda {

    define_cuda_server_key_bench_clean_input_scalar_fn!(
        method_name: unsigned_overflowing_scalar_add,
-        display_name: overflowing_scalar_add,
+        display_name: overflowing_add,
        rng_func: default_scalar
    );

--- a/tfhe/benches/shortint/bench.rs
+++ b/tfhe/benches/shortint/bench.rs
@@ -553,7 +553,7 @@ define_server_key_bench_fn!(
 );
 define_server_key_bench_fn!(
    method_name: greater,
-    display_name: greater,
+    display_name: greater_than,
    BenchParamsSet::Standard
 );
 define_server_key_bench_fn!(
@@ -563,7 +563,7 @@ define_server_key_bench_fn!(
 );
 define_server_key_bench_fn!(
    method_name: less,
-    display_name: less,
+    display_name: less_than,
    BenchParamsSet::Standard
 );
 define_server_key_bench_fn!(
@@ -676,7 +676,7 @@ define_server_key_scalar_div_bench_fn!(
 );
 define_server_key_scalar_bench_fn!(
    method_name: scalar_greater,
-    display_name: greater,
+    display_name: greater_than,
    BenchParamsSet::Standard
 );
 define_server_key_scalar_bench_fn!(
@@ -686,7 +686,7 @@ define_server_key_scalar_bench_fn!(
 );
 define_server_key_scalar_bench_fn!(
    method_name: scalar_less,
-    display_name: less,
+    display_name: less_than,
    BenchParamsSet::Standard
 );
 define_server_key_scalar_bench_fn!(
--- a/tfhe/benches/shortint/casting.rs
+++ b/tfhe/benches/shortint/casting.rs
@@ -4,7 +4,7 @@ use rayon::prelude::*;
 use tfhe::shortint::prelude::*;

 pub fn pack_cast_64(c: &mut Criterion) {
-    let bench_name = "pack_cast_64";
+    let bench_name = "shortint::pack_cast_64";
    let mut bench_group = c.benchmark_group(bench_name);

    let (client_key_1, server_key_1): (ClientKey, ServerKey) =
@@ -55,7 +55,7 @@ pub fn pack_cast_64(c: &mut Criterion) {
 }

 pub fn pack_cast(c: &mut Criterion) {
-    let bench_name = "pack_cast";
+    let bench_name = "shortint::pack_cast";
    let mut bench_group = c.benchmark_group(bench_name);

    let (client_key_1, server_key_1): (ClientKey, ServerKey) =
@@ -96,7 +96,7 @@ pub fn pack_cast(c: &mut Criterion) {
 }

 pub fn cast(c: &mut Criterion) {
-    let bench_name = "cast";
+    let bench_name = "shortint::cast";
    let mut bench_group = c.benchmark_group(bench_name);

    let (client_key_1, server_key_1): (ClientKey, ServerKey) =
--- a/tfhe/docs/guides/data_versioning.md
+++ b/tfhe/docs/guides/data_versioning.md
@@ -2,7 +2,7 @@

 This document explains how to save and load versioned data using the data versioning feature.

-Starting from v0.7.0, **TFHE-rs** supports versioned data types. This allows you to store data and load it in the future without compatibility concerns. This feature is done by the `tfhe-versionable` crate.
+Starting from v0.6.4, **TFHE-rs** supports versioned data types. This allows you to store data and load it in the future without compatibility concerns. This feature is done by the `tfhe-versionable` crate.

 This versioning scheme is compatible with all the [data formats](https://serde.rs/#data-formats) supported by serde.

@@ -85,7 +85,7 @@ When possible, data will be upgraded automatically without any kind of interract
 You will find below a list of breaking changes and how to upgrade them.

 # 0.6 -> 0.7
- `crate::integer::ciphertext::CompactCiphertextList`
+- `tfhe::integer::ciphertext::CompactCiphertextList`:
  in 0.6, these lists of ciphertext were statically typed and homogenous. Since 0.7, they are heterogeneous. The new version stores for each element an information about its type (Signed, Unsigned or Boolean). Since this information were not stored before, the list is set to be made of `Unsigned` integers by default. If that is not the case, you can set its type using the following snippet:

 ```rust
@@ -142,3 +142,12 @@ pub fn main() {
    assert_eq!(-1i8, decrypted);
 }
 ```
+
+- `tfhe::{CompactFheInt, CompactFheUint, CompactFheIntList, CompactFheUintList}`:
+  The types have been deprecated, they are only kept in **TFHE-rs** for backward compatibility. They can now be accessed using the `tfhe::high_level_api::backward_compatibility::integers` module. The only functionality that is still supported is to unversionize them and expand them into regular `FheInt`, `FheUint`, `Vec<FehInt>` and `Vec<FheUint>`:
+
+```Rust
+    let loaded_ct = CompactFheUint8::unversionize(versioned_ct).unwrap();
+    let ct = loaded_ct.expand();
+```
+  Starting with v0.7, this compact list functionality is provided by the `tfhe::CompactCiphertextList` type.
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -933,6 +933,91 @@ pub unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B: Numeric
    );
 }

+#[allow(clippy::too_many_arguments)]
+/// # Safety
+///
+/// - [CudaStreams::synchronize] __must__ be called after this function
+/// as soon as synchronization is required
+pub unsafe fn propagate_single_carry_get_input_carries_assign_async<
+    T: UnsignedInteger,
+    B: Numeric,
+>(
+    streams: &CudaStreams,
+    radix_lwe_input: &mut CudaVec<T>,
+    carry_out: &mut CudaVec<T>,
+    input_carries: &mut CudaVec<T>,
+    bootstrapping_key: &CudaVec<B>,
+    keyswitch_key: &CudaVec<T>,
+    lwe_dimension: LweDimension,
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+    ks_level: DecompositionLevelCount,
+    ks_base_log: DecompositionBaseLog,
+    pbs_level: DecompositionLevelCount,
+    pbs_base_log: DecompositionBaseLog,
+    num_blocks: u32,
+    message_modulus: MessageModulus,
+    carry_modulus: CarryModulus,
+    pbs_type: PBSType,
+    grouping_factor: LweBskGroupingFactor,
+) {
+    assert_eq!(
+        streams.gpu_indexes[0],
+        radix_lwe_input.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        bootstrapping_key.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        keyswitch_key.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
+    scratch_cuda_propagate_single_carry_kb_64_inplace(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes.as_ptr(),
+        streams.len() as u32,
+        std::ptr::addr_of_mut!(mem_ptr),
+        glwe_dimension.0 as u32,
+        polynomial_size.0 as u32,
+        big_lwe_dimension,
+        lwe_dimension.0 as u32,
+        ks_level.0 as u32,
+        ks_base_log.0 as u32,
+        pbs_level.0 as u32,
+        pbs_base_log.0 as u32,
+        grouping_factor.0 as u32,
+        num_blocks,
+        message_modulus.0 as u32,
+        carry_modulus.0 as u32,
+        pbs_type as u32,
+        true,
+    );
+    cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes.as_ptr(),
+        streams.len() as u32,
+        radix_lwe_input.as_mut_c_ptr(0),
+        carry_out.as_mut_c_ptr(0),
+        input_carries.as_mut_c_ptr(0),
+        mem_ptr,
+        bootstrapping_key.ptr.as_ptr(),
+        keyswitch_key.ptr.as_ptr(),
+        num_blocks,
+    );
+    cleanup_cuda_propagate_single_carry(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes.as_ptr(),
+        streams.len() as u32,
+        std::ptr::addr_of_mut!(mem_ptr),
+    );
+}
+
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
--- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs
@@ -29,24 +29,24 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg = 1u64;
    ///
    /// let ct = cks.encrypt(msg);
    ///
    /// // Copy to GPU
-    /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
+    /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.unchecked_bitnot(&d_ct, &mut stream);
+    /// let d_ct_res = sks.unchecked_bitnot(&d_ct, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -55,24 +55,24 @@ impl CudaServerKey {
    pub fn unchecked_bitnot<T: CudaIntegerRadixCiphertext>(
        &self,
        ct: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.unchecked_bitnot_assign(&mut result, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.unchecked_bitnot_assign(&mut result, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
        &self,
        ct: &mut T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        // We do (-ciphertext) + (msg_mod -1) as it allows to avoid an allocation
-        cuda_lwe_ciphertext_negate_assign(&mut ct.as_mut().d_blocks, stream);
+        cuda_lwe_ciphertext_negate_assign(&mut ct.as_mut().d_blocks, streams);

        let ct_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0;

@@ -81,14 +81,21 @@ impl CudaServerKey {
        let shift_plaintext = u64::from(scalar) * delta;

        let scalar_vector = vec![shift_plaintext; ct_blocks];
-        let mut d_decomposed_scalar =
-            CudaVec::<u64>::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, stream, 0);
-        d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), stream, 0);
+        let mut d_decomposed_scalar = CudaVec::<u64>::new_async(
+            ct.as_ref().d_blocks.lwe_ciphertext_count().0,
+            streams,
+            streams.gpu_indexes[0],
+        );
+        d_decomposed_scalar.copy_from_cpu_async(
+            scalar_vector.as_slice(),
+            streams,
+            streams.gpu_indexes[0],
+        );

        cuda_lwe_ciphertext_plaintext_add_assign(
            &mut ct.as_mut().d_blocks,
            &d_decomposed_scalar,
-            stream,
+            streams,
        );
        ct.as_mut().info = ct.as_ref().info.after_bitnot();
    }
@@ -96,12 +103,12 @@ impl CudaServerKey {
    pub fn unchecked_bitnot_assign<T: CudaIntegerRadixCiphertext>(
        &self,
        ct: &mut T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.unchecked_bitnot_assign_async(ct, stream);
+            self.unchecked_bitnot_assign_async(ct, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitand between two ciphertexts encrypting integer values.
@@ -121,11 +128,11 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg1 = 201u64;
    /// let msg2 = 1u64;
@@ -134,14 +141,14 @@ impl CudaServerKey {
    /// let ct2 = cks.encrypt(msg2);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
-    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
+    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.unchecked_bitand(&d_ct1, &d_ct2, &mut stream);
+    /// let d_ct_res = sks.unchecked_bitand(&d_ct1, &d_ct2, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -151,23 +158,23 @@ impl CudaServerKey {
        &self,
        ct_left: &T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct_left.duplicate_async(stream) };
-        self.unchecked_bitand_assign(&mut result, ct_right, stream);
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.unchecked_bitand_assign(&mut result, ct_right, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_bitop_assign_async<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
        op: BitOpType,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        assert_eq!(
            ct_left.as_ref().d_blocks.lwe_dimension(),
@@ -183,7 +190,7 @@ impl CudaServerKey {
        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                unchecked_bitop_integer_radix_kb_assign_async(
-                    stream,
+                    streams,
                    &mut ct_left.as_mut().d_blocks.0.d_vec,
                    &ct_right.as_ref().d_blocks.0.d_vec,
                    &d_bsk.d_vec,
@@ -210,7 +217,7 @@ impl CudaServerKey {
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                unchecked_bitop_integer_radix_kb_assign_async(
-                    stream,
+                    streams,
                    &mut ct_left.as_mut().d_blocks.0.d_vec,
                    &ct_right.as_ref().d_blocks.0.d_vec,
                    &d_multibit_bsk.d_vec,
@@ -242,13 +249,13 @@ impl CudaServerKey {
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, stream);
+            self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::And, streams);
            ct_left.as_mut().info = ct_left.as_ref().info.after_bitand(&ct_right.as_ref().info);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitor between two ciphertexts encrypting integer values.
@@ -268,11 +275,11 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg1 = 200u64;
    /// let msg2 = 1u64;
@@ -281,14 +288,14 @@ impl CudaServerKey {
    /// let ct2 = cks.encrypt(msg2);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
-    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
+    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.unchecked_bitor(&d_ct1, &d_ct2, &mut stream);
+    /// let d_ct_res = sks.unchecked_bitor(&d_ct1, &d_ct2, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -298,10 +305,10 @@ impl CudaServerKey {
        &self,
        ct_left: &T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct_left.duplicate_async(stream) };
-        self.unchecked_bitor_assign(&mut result, ct_right, stream);
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.unchecked_bitor_assign(&mut result, ct_right, streams);
        result
    }

@@ -309,13 +316,13 @@ impl CudaServerKey {
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, stream);
+            self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Or, streams);
            ct_left.as_mut().info = ct_left.as_ref().info.after_bitor(&ct_right.as_ref().info);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitxor between two ciphertexts encrypting integer values.
@@ -335,11 +342,11 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg1 = 49;
    /// let msg2 = 64;
@@ -348,14 +355,14 @@ impl CudaServerKey {
    /// let ct2 = cks.encrypt(msg2);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
-    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
+    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.unchecked_bitxor(&d_ct1, &d_ct2, &mut stream);
+    /// let d_ct_res = sks.unchecked_bitxor(&d_ct1, &d_ct2, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -365,10 +372,10 @@ impl CudaServerKey {
        &self,
        ct_left: &T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct_left.duplicate_async(stream) };
-        self.unchecked_bitxor_assign(&mut result, ct_right, stream);
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.unchecked_bitxor_assign(&mut result, ct_right, streams);
        result
    }

@@ -376,13 +383,13 @@ impl CudaServerKey {
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, stream);
+            self.unchecked_bitop_assign_async(ct_left, ct_right, BitOpType::Xor, streams);
            ct_left.as_mut().info = ct_left.as_ref().info.after_bitxor(&ct_right.as_ref().info);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitand between two ciphertexts encrypting integer values.
@@ -402,11 +409,11 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg1 = 201u64;
    /// let msg2 = 1u64;
@@ -415,14 +422,14 @@ impl CudaServerKey {
    /// let ct2 = cks.encrypt(msg2);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
-    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
+    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.bitand(&d_ct1, &d_ct2, &mut stream);
+    /// let d_ct_res = sks.bitand(&d_ct1, &d_ct2, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -432,22 +439,22 @@ impl CudaServerKey {
        &self,
        ct_left: &T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct_left.duplicate_async(stream) };
-        self.bitand_assign(&mut result, ct_right, stream);
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.bitand_assign(&mut result, ct_right, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn bitand_assign_async<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        let mut tmp_rhs;

@@ -458,36 +465,36 @@ impl CudaServerKey {
            ) {
                (true, true) => (ct_left, ct_right),
                (true, false) => {
-                    tmp_rhs = ct_right.duplicate_async(stream);
-                    self.full_propagate_assign_async(&mut tmp_rhs, stream);
+                    tmp_rhs = ct_right.duplicate_async(streams);
+                    self.full_propagate_assign_async(&mut tmp_rhs, streams);
                    (ct_left, &tmp_rhs)
                }
                (false, true) => {
-                    self.full_propagate_assign_async(ct_left, stream);
+                    self.full_propagate_assign_async(ct_left, streams);
                    (ct_left, ct_right)
                }
                (false, false) => {
-                    tmp_rhs = ct_right.duplicate_async(stream);
+                    tmp_rhs = ct_right.duplicate_async(streams);

-                    self.full_propagate_assign_async(ct_left, stream);
-                    self.full_propagate_assign_async(&mut tmp_rhs, stream);
+                    self.full_propagate_assign_async(ct_left, streams);
+                    self.full_propagate_assign_async(&mut tmp_rhs, streams);
                    (ct_left, &tmp_rhs)
                }
            }
        };
-        self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, stream);
+        self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::And, streams);
    }

    pub fn bitand_assign<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.bitand_assign_async(ct_left, ct_right, stream);
+            self.bitand_assign_async(ct_left, ct_right, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitor between two ciphertexts encrypting integer values.
@@ -507,11 +514,11 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg1 = 201u64;
    /// let msg2 = 1u64;
@@ -520,14 +527,14 @@ impl CudaServerKey {
    /// let ct2 = cks.encrypt(msg2);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
-    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
+    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.bitor(&d_ct1, &d_ct2, &mut stream);
+    /// let d_ct_res = sks.bitor(&d_ct1, &d_ct2, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -537,22 +544,22 @@ impl CudaServerKey {
        &self,
        ct_left: &T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct_left.duplicate_async(stream) };
-        self.bitor_assign(&mut result, ct_right, stream);
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.bitor_assign(&mut result, ct_right, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn bitor_assign_async<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        let mut tmp_rhs;

@@ -562,36 +569,36 @@ impl CudaServerKey {
        ) {
            (true, true) => (ct_left, ct_right),
            (true, false) => {
-                tmp_rhs = ct_right.duplicate_async(stream);
-                self.full_propagate_assign_async(&mut tmp_rhs, stream);
+                tmp_rhs = ct_right.duplicate_async(streams);
+                self.full_propagate_assign_async(&mut tmp_rhs, streams);
                (ct_left, &tmp_rhs)
            }
            (false, true) => {
-                self.full_propagate_assign_async(ct_left, stream);
+                self.full_propagate_assign_async(ct_left, streams);
                (ct_left, ct_right)
            }
            (false, false) => {
-                tmp_rhs = ct_right.duplicate_async(stream);
+                tmp_rhs = ct_right.duplicate_async(streams);

-                self.full_propagate_assign_async(ct_left, stream);
-                self.full_propagate_assign_async(&mut tmp_rhs, stream);
+                self.full_propagate_assign_async(ct_left, streams);
+                self.full_propagate_assign_async(&mut tmp_rhs, streams);
                (ct_left, &tmp_rhs)
            }
        };

-        self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, stream);
+        self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Or, streams);
    }

    pub fn bitor_assign<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.bitor_assign_async(ct_left, ct_right, stream);
+            self.bitor_assign_async(ct_left, ct_right, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitxor between two ciphertexts encrypting integer values.
@@ -611,11 +618,11 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg1 = 201u64;
    /// let msg2 = 1u64;
@@ -624,14 +631,14 @@ impl CudaServerKey {
    /// let ct2 = cks.encrypt(msg2);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut stream);
-    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams);
+    /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.bitxor(&d_ct1, &d_ct2, &mut stream);
+    /// let d_ct_res = sks.bitxor(&d_ct1, &d_ct2, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
@@ -641,22 +648,22 @@ impl CudaServerKey {
        &self,
        ct_left: &T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
-        let mut result = unsafe { ct_left.duplicate_async(stream) };
-        self.bitxor_assign(&mut result, ct_right, stream);
+        let mut result = unsafe { ct_left.duplicate_async(streams) };
+        self.bitxor_assign(&mut result, ct_right, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn bitxor_assign_async<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        let mut tmp_rhs;

@@ -666,36 +673,36 @@ impl CudaServerKey {
        ) {
            (true, true) => (ct_left, ct_right),
            (true, false) => {
-                tmp_rhs = ct_right.duplicate_async(stream);
-                self.full_propagate_assign_async(&mut tmp_rhs, stream);
+                tmp_rhs = ct_right.duplicate_async(streams);
+                self.full_propagate_assign_async(&mut tmp_rhs, streams);
                (ct_left, &tmp_rhs)
            }
            (false, true) => {
-                self.full_propagate_assign_async(ct_left, stream);
+                self.full_propagate_assign_async(ct_left, streams);
                (ct_left, ct_right)
            }
            (false, false) => {
-                tmp_rhs = ct_right.duplicate_async(stream);
+                tmp_rhs = ct_right.duplicate_async(streams);

-                self.full_propagate_assign_async(ct_left, stream);
-                self.full_propagate_assign_async(&mut tmp_rhs, stream);
+                self.full_propagate_assign_async(ct_left, streams);
+                self.full_propagate_assign_async(&mut tmp_rhs, streams);
                (ct_left, &tmp_rhs)
            }
        };

-        self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, stream);
+        self.unchecked_bitop_assign_async(lhs, rhs, BitOpType::Xor, streams);
    }

    pub fn bitxor_assign<T: CudaIntegerRadixCiphertext>(
        &self,
        ct_left: &mut T,
        ct_right: &T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        unsafe {
-            self.bitxor_assign_async(ct_left, ct_right, stream);
+            self.bitxor_assign_async(ct_left, ct_right, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically bitnot for an encrypted integer value.
@@ -716,55 +723,55 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg = 1u64;
    ///
    /// let ct = cks.encrypt(msg);
    ///
    /// // Copy to GPU
-    /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
+    /// let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
    ///
    /// // Compute homomorphically a bitwise and:
-    /// let d_ct_res = sks.bitnot(&d_ct, &mut stream);
+    /// let d_ct_res = sks.bitnot(&d_ct, &mut streams);
    ///
    /// // Copy back to CPU
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// // Decrypt:
    /// let dec: u64 = cks.decrypt(&ct_res);
    /// assert_eq!(dec, !msg % 256);
    /// ```
-    pub fn bitnot<T: CudaIntegerRadixCiphertext>(&self, ct: &T, stream: &CudaStreams) -> T {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.bitnot_assign(&mut result, stream);
+    pub fn bitnot<T: CudaIntegerRadixCiphertext>(&self, ct: &T, streams: &CudaStreams) -> T {
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.bitnot_assign(&mut result, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn bitnot_assign_async<T: CudaIntegerRadixCiphertext>(
        &self,
        ct: &mut T,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) {
        if !ct.block_carries_are_empty() {
-            self.full_propagate_assign_async(ct, stream);
+            self.full_propagate_assign_async(ct, streams);
        }

-        self.unchecked_bitnot_assign_async(ct, stream);
+        self.unchecked_bitnot_assign_async(ct, streams);
    }

-    pub fn bitnot_assign<T: CudaIntegerRadixCiphertext>(&self, ct: &mut T, stream: &CudaStreams) {
+    pub fn bitnot_assign<T: CudaIntegerRadixCiphertext>(&self, ct: &mut T, streams: &CudaStreams) {
        unsafe {
-            self.bitnot_assign_async(ct, stream);
+            self.bitnot_assign_async(ct, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }
 }
--- a/tfhe/src/integer/gpu/server_key/radix/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs
@@ -14,7 +14,8 @@ use crate::integer::gpu::ciphertext::{
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
    apply_univariate_lut_kb_async, full_propagate_assign_async,
-    propagate_single_carry_assign_async, CudaServerKey, PBSType,
+    propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async,
+    CudaServerKey, PBSType,
 };
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::engine::fill_accumulator;
@@ -224,6 +225,80 @@ impl CudaServerKey {
        carry_out
    }

+    /// # Safety
+    ///
+    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until stream is synchronized
+    #[allow(dead_code)]
+    pub(crate) unsafe fn propagate_single_carry_get_input_carries_assign_async<T>(
+        &self,
+        ct: &mut T,
+        input_carries: &mut T,
+        streams: &CudaStreams,
+    ) -> T
+    where
+        T: CudaIntegerRadixCiphertext,
+    {
+        let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
+        let ciphertext = ct.as_mut();
+        let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
+        match &self.bootstrapping_key {
+            CudaBootstrappingKey::Classic(d_bsk) => {
+                propagate_single_carry_get_input_carries_assign_async(
+                    streams,
+                    &mut ciphertext.d_blocks.0.d_vec,
+                    &mut carry_out.as_mut().d_blocks.0.d_vec,
+                    &mut input_carries.as_mut().d_blocks.0.d_vec,
+                    &d_bsk.d_vec,
+                    &self.key_switching_key.d_vec,
+                    d_bsk.input_lwe_dimension(),
+                    d_bsk.glwe_dimension(),
+                    d_bsk.polynomial_size(),
+                    self.key_switching_key.decomposition_level_count(),
+                    self.key_switching_key.decomposition_base_log(),
+                    d_bsk.decomp_level_count(),
+                    d_bsk.decomp_base_log(),
+                    num_blocks,
+                    ciphertext.info.blocks.first().unwrap().message_modulus,
+                    ciphertext.info.blocks.first().unwrap().carry_modulus,
+                    PBSType::Classical,
+                    LweBskGroupingFactor(0),
+                );
+            }
+            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
+                propagate_single_carry_get_input_carries_assign_async(
+                    streams,
+                    &mut ciphertext.d_blocks.0.d_vec,
+                    &mut carry_out.as_mut().d_blocks.0.d_vec,
+                    &mut input_carries.as_mut().d_blocks.0.d_vec,
+                    &d_multibit_bsk.d_vec,
+                    &self.key_switching_key.d_vec,
+                    d_multibit_bsk.input_lwe_dimension(),
+                    d_multibit_bsk.glwe_dimension(),
+                    d_multibit_bsk.polynomial_size(),
+                    self.key_switching_key.decomposition_level_count(),
+                    self.key_switching_key.decomposition_base_log(),
+                    d_multibit_bsk.decomp_level_count(),
+                    d_multibit_bsk.decomp_base_log(),
+                    num_blocks,
+                    ciphertext.info.blocks.first().unwrap().message_modulus,
+                    ciphertext.info.blocks.first().unwrap().carry_modulus,
+                    PBSType::MultiBit,
+                    d_multibit_bsk.grouping_factor,
+                );
+            }
+        };
+        ciphertext.info.blocks.iter_mut().for_each(|b| {
+            b.degree = Degree::new(b.message_modulus.0 - 1);
+            b.noise_level = NoiseLevel::NOMINAL;
+        });
+        carry_out.as_mut().info.blocks.iter_mut().for_each(|b| {
+            b.degree = Degree::new(1);
+            b.noise_level = NoiseLevel::NOMINAL;
+        });
+        carry_out
+    }
+
    /// # Safety
    ///
    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
@@ -327,25 +402,26 @@ impl CudaServerKey {
        &self,
        ct: &T,
        num_blocks: usize,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
        let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 + num_blocks;
        let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
        let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
        let shift = num_blocks * lwe_size.0;

-        let mut extended_ct_vec =
-            unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
+        let mut extended_ct_vec = unsafe {
+            CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
+        };
        unsafe {
-            extended_ct_vec.memset_async(0u64, stream, 0);
+            extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]);
            extended_ct_vec.copy_self_range_gpu_to_gpu_async(
                shift..,
                &ct.as_ref().d_blocks.0.d_vec,
-                stream,
+                streams,
                0,
            );
        }
-        stream.synchronize();
+        streams.synchronize();
        let extended_ct_list = CudaLweCiphertextList::from_cuda_vec(
            extended_ct_vec,
            LweCiphertextCount(new_num_blocks),
@@ -398,19 +474,24 @@ impl CudaServerKey {
        &self,
        ct: &T,
        num_blocks: usize,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
        let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 + num_blocks;
        let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
        let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();

-        let mut extended_ct_vec =
-            unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
+        let mut extended_ct_vec = unsafe {
+            CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
+        };
        unsafe {
-            extended_ct_vec.memset_async(0u64, stream, 0);
-            extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0);
+            extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]);
+            extended_ct_vec.copy_from_gpu_async(
+                &ct.as_ref().d_blocks.0.d_vec,
+                streams,
+                streams.gpu_indexes[0],
+            );
        }
-        stream.synchronize();
+        streams.synchronize();
        let extended_ct_list = CudaLweCiphertextList::from_cuda_vec(
            extended_ct_vec,
            LweCiphertextCount(new_num_blocks),
@@ -463,24 +544,25 @@ impl CudaServerKey {
        &self,
        ct: &T,
        num_blocks: usize,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
        let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 - num_blocks;
        let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
        let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
        let shift = num_blocks * lwe_size.0;

-        let mut trimmed_ct_vec =
-            unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
+        let mut trimmed_ct_vec = unsafe {
+            CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
+        };
        unsafe {
            trimmed_ct_vec.copy_src_range_gpu_to_gpu_async(
                shift..,
                &ct.as_ref().d_blocks.0.d_vec,
-                stream,
+                streams,
                0,
            );
        }
-        stream.synchronize();
+        streams.synchronize();
        let trimmed_ct_list = CudaLweCiphertextList::from_cuda_vec(
            trimmed_ct_vec,
            LweCiphertextCount(new_num_blocks),
@@ -530,24 +612,25 @@ impl CudaServerKey {
        &self,
        ct: &T,
        num_blocks: usize,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
        let new_num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 - num_blocks;
        let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus();
        let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size();
        let shift = new_num_blocks * lwe_size.0;

-        let mut trimmed_ct_vec =
-            unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, stream, 0) };
+        let mut trimmed_ct_vec = unsafe {
+            CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0])
+        };
        unsafe {
            trimmed_ct_vec.copy_src_range_gpu_to_gpu_async(
                0..shift,
                &ct.as_ref().d_blocks.0.d_vec,
-                stream,
+                streams,
                0,
            );
        }
-        stream.synchronize();
+        streams.synchronize();
        let trimmed_ct_list = CudaLweCiphertextList::from_cuda_vec(
            trimmed_ct_vec,
            LweCiphertextCount(new_num_blocks),
@@ -594,7 +677,7 @@ impl CudaServerKey {
        &self,
        ct: &T,
        num_blocks: usize,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T {
        let message_modulus = self.message_modulus.0 as u64;
        let num_bits_in_block = message_modulus.ilog2();
@@ -612,28 +695,40 @@ impl CudaServerKey {
        let lwe_size = ct.as_ref().d_blocks.0.lwe_dimension.to_lwe_size().0;

        // Allocate the necessary amount of memory
-        let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, stream, 0);
+        let mut output_radix = CudaVec::new(
+            new_num_ct_blocks * lwe_size,
+            streams,
+            streams.gpu_indexes[0],
+        );
        unsafe {
-            output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, stream, 0);
+            output_radix.copy_from_gpu_async(
+                &ct.as_ref().d_blocks.0.d_vec,
+                streams,
+                streams.gpu_indexes[0],
+            );
            // Get the last ct block
            let last_block = ct
                .as_ref()
                .d_blocks
                .0
                .d_vec
-                .as_slice(lwe_size * (num_ct_blocks - 1).., 0)
+                .as_slice(lwe_size * (num_ct_blocks - 1).., streams.gpu_indexes[0])
                .unwrap();
            let mut output_slice = output_radix
-                .as_mut_slice(lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks, 0)
+                .as_mut_slice(
+                    lwe_size * num_ct_blocks..lwe_size * new_num_ct_blocks,
+                    streams.gpu_indexes[0],
+                )
                .unwrap();
-            let (padding_block, new_blocks) = output_slice.split_at_mut(lwe_size, 0);
+            let (padding_block, new_blocks) =
+                output_slice.split_at_mut(lwe_size, streams.gpu_indexes[0]);
            let mut padding_block = padding_block.unwrap();
            let mut new_blocks = new_blocks.unwrap();

            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
                    apply_univariate_lut_kb_async(
-                        stream,
+                        streams,
                        &mut padding_block,
                        &last_block,
                        padding_block_creator_lut.acc.as_ref(),
@@ -657,7 +752,7 @@ impl CudaServerKey {
                }
                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                    apply_univariate_lut_kb_async(
-                        stream,
+                        streams,
                        &mut padding_block,
                        &last_block,
                        padding_block_creator_lut.acc.as_ref(),
@@ -682,12 +777,12 @@ impl CudaServerKey {
            }
            for i in 0..num_blocks - 1 {
                let mut output_block = new_blocks
-                    .get_mut(lwe_size * i..lwe_size * (i + 1), 0)
+                    .get_mut(lwe_size * i..lwe_size * (i + 1), streams.gpu_indexes[0])
                    .unwrap();
-                output_block.copy_from_gpu_async(&padding_block, stream, 0);
+                output_block.copy_from_gpu_async(&padding_block, streams, streams.gpu_indexes[0]);
            }
        }
-        stream.synchronize();
+        streams.synchronize();
        let output_lwe_list = CudaLweCiphertextList(CudaLweList {
            d_vec: output_radix,
            lwe_ciphertext_count: LweCiphertextCount(new_num_ct_blocks),
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs
@@ -11,14 +11,14 @@ use crate::integer::gpu::{
 impl CudaServerKey {
    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_bitop_assign_async<Scalar, T>(
        &self,
        ct: &mut T,
        rhs: Scalar,
        op: BitOpType,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
@@ -31,12 +31,13 @@ impl CudaServerKey {
            .map(|x| x as u64)
            .collect::<Vec<_>>();

-        let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, stream, 0);
+        let clear_blocks =
+            CudaVec::from_cpu_async(&h_clear_blocks, streams, streams.gpu_indexes[0]);

        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                unchecked_scalar_bitop_integer_radix_kb_assign_async(
-                    stream,
+                    streams,
                    &mut ct.as_mut().d_blocks.0.d_vec,
                    &clear_blocks,
                    &d_bsk.d_vec,
@@ -63,7 +64,7 @@ impl CudaServerKey {
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                unchecked_scalar_bitop_integer_radix_kb_assign_async(
-                    stream,
+                    streams,
                    &mut ct.as_mut().d_blocks.0.d_vec,
                    &clear_blocks,
                    &d_multibit_bsk.d_vec,
@@ -91,13 +92,18 @@ impl CudaServerKey {
        }
    }

-    pub fn unchecked_scalar_bitand<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_bitand<Scalar, T>(
+        &self,
+        ct: &T,
+        rhs: Scalar,
+        streams: &CudaStreams,
+    ) -> T
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.unchecked_scalar_bitand_assign(&mut result, rhs, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.unchecked_scalar_bitand_assign(&mut result, rhs, streams);
        result
    }

@@ -105,25 +111,25 @@ impl CudaServerKey {
        &self,
        ct: &mut T,
        rhs: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, stream);
+            self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams);
            ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

-    pub fn unchecked_scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.unchecked_scalar_bitor_assign(&mut result, rhs, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.unchecked_scalar_bitor_assign(&mut result, rhs, streams);
        result
    }

@@ -131,25 +137,30 @@ impl CudaServerKey {
        &self,
        ct: &mut T,
        rhs: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, stream);
+            self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams);
            ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

-    pub fn unchecked_scalar_bitxor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_bitxor<Scalar, T>(
+        &self,
+        ct: &T,
+        rhs: Scalar,
+        streams: &CudaStreams,
+    ) -> T
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.unchecked_scalar_bitxor_assign(&mut result, rhs, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.unchecked_scalar_bitxor_assign(&mut result, rhs, streams);
        result
    }

@@ -157,138 +168,138 @@ impl CudaServerKey {
        &self,
        ct: &mut T,
        rhs: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, stream);
+            self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams);
            ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_bitand_assign_async<Scalar, T>(
        &self,
        ct: &mut T,
        rhs: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        if !ct.block_carries_are_empty() {
-            self.full_propagate_assign_async(ct, stream);
+            self.full_propagate_assign_async(ct, streams);
        }
-        self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, stream);
+        self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams);
        ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs);
    }

-    pub fn scalar_bitand_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams)
+    pub fn scalar_bitand_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.scalar_bitand_assign_async(ct, rhs, stream);
+            self.scalar_bitand_assign_async(ct, rhs, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

-    pub fn scalar_bitand<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_bitand<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.scalar_bitand_assign(&mut result, rhs, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.scalar_bitand_assign(&mut result, rhs, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_bitor_assign_async<Scalar, T>(
        &self,
        ct: &mut T,
        rhs: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        if !ct.block_carries_are_empty() {
-            self.full_propagate_assign_async(ct, stream);
+            self.full_propagate_assign_async(ct, streams);
        }
-        self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, stream);
+        self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams);
        ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs);
    }

-    pub fn scalar_bitor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams)
+    pub fn scalar_bitor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.scalar_bitor_assign_async(ct, rhs, stream);
+            self.scalar_bitor_assign_async(ct, rhs, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

-    pub fn scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_bitor<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.scalar_bitor_assign(&mut result, rhs, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.scalar_bitor_assign(&mut result, rhs, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_bitxor_assign_async<Scalar, T>(
        &self,
        ct: &mut T,
        rhs: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        if !ct.block_carries_are_empty() {
-            self.full_propagate_assign_async(ct, stream);
+            self.full_propagate_assign_async(ct, streams);
        }
-        self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, stream);
+        self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams);
        ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs);
    }

-    pub fn scalar_bitxor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, stream: &CudaStreams)
+    pub fn scalar_bitxor_assign<Scalar, T>(&self, ct: &mut T, rhs: Scalar, streams: &CudaStreams)
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.scalar_bitxor_assign_async(ct, rhs, stream);
+            self.scalar_bitxor_assign_async(ct, rhs, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

-    pub fn scalar_bitxor<Scalar, T>(&self, ct: &T, rhs: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_bitxor<Scalar, T>(&self, ct: &T, rhs: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: DecomposableInto<u8>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.scalar_bitxor_assign(&mut result, rhs, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.scalar_bitxor_assign(&mut result, rhs, streams);
        result
    }
 }
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs
@@ -102,15 +102,15 @@ impl CudaServerKey {

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_signed_and_unsigned_scalar_comparison_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
        op: ComparisonType,
        signed_with_positive_scalar: bool,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
@@ -122,7 +122,7 @@ impl CudaServerKey {
                ComparisonType::GT | ComparisonType::GE | ComparisonType::NE => 1,
                _ => 0,
            };
-            let ct_res: T = self.create_trivial_radix(value, 1, stream);
+            let ct_res: T = self.create_trivial_radix(value, 1, streams);
            return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
        }

@@ -144,7 +144,7 @@ impl CudaServerKey {
                ComparisonType::LT | ComparisonType::LE | ComparisonType::NE => 1,
                _ => 0,
            };
-            let ct_res: T = self.create_trivial_radix(value, 1, stream);
+            let ct_res: T = self.create_trivial_radix(value, 1, streams);
            return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
        }

@@ -153,7 +153,8 @@ impl CudaServerKey {
        // as we will handle them separately.
        scalar_blocks.truncate(ct.as_ref().d_blocks.lwe_ciphertext_count().0);

-        let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream, 0);
+        let d_scalar_blocks: CudaVec<u64> =
+            CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]);

        let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();

@@ -161,7 +162,7 @@ impl CudaServerKey {
            ct.as_ref().d_blocks.lwe_dimension(),
            LweCiphertextCount(1),
            CiphertextModulus::new_native(),
-            stream,
+            streams,
        );
        let mut block_info = ct.as_ref().info.blocks[0];
        block_info.degree = Degree::new(0);
@@ -174,7 +175,7 @@ impl CudaServerKey {
        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                unchecked_scalar_comparison_integer_radix_kb_async(
-                    stream,
+                    streams,
                    &mut result.as_mut().ciphertext.d_blocks.0.d_vec,
                    &ct.as_ref().d_blocks.0.d_vec,
                    &d_scalar_blocks,
@@ -204,7 +205,7 @@ impl CudaServerKey {
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                unchecked_scalar_comparison_integer_radix_kb_async(
-                    stream,
+                    streams,
                    &mut result.as_mut().ciphertext.d_blocks.0.d_vec,
                    &ct.as_ref().d_blocks.0.d_vec,
                    &d_scalar_blocks,
@@ -239,14 +240,14 @@ impl CudaServerKey {

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_comparison_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
        op: ComparisonType,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
@@ -260,12 +261,12 @@ impl CudaServerKey {
                    // Scalar is greater than the bounds, so ciphertext is smaller
                    let result: T = match op {
                        ComparisonType::LT | ComparisonType::LE => {
-                            self.create_trivial_radix(1, num_blocks, stream)
+                            self.create_trivial_radix(1, num_blocks, streams)
                        }
                        _ => self.create_trivial_radix(
                            0,
                            ct.as_ref().d_blocks.lwe_ciphertext_count().0,
-                            stream,
+                            streams,
                        ),
                    };
                    return CudaBooleanBlock::from_cuda_radix_ciphertext(result.into_inner());
@@ -274,12 +275,12 @@ impl CudaServerKey {
                    // Scalar is smaller than the bounds, so ciphertext is bigger
                    let result: T = match op {
                        ComparisonType::GT | ComparisonType::GE => {
-                            self.create_trivial_radix(1, num_blocks, stream)
+                            self.create_trivial_radix(1, num_blocks, streams)
                        }
                        _ => self.create_trivial_radix(
                            0,
                            ct.as_ref().d_blocks.lwe_ciphertext_count().0,
-                            stream,
+                            streams,
                        ),
                    };
                    return CudaBooleanBlock::from_cuda_radix_ciphertext(result.into_inner());
@@ -292,29 +293,29 @@ impl CudaServerKey {

            if scalar >= Scalar::ZERO {
                self.unchecked_signed_and_unsigned_scalar_comparison_async(
-                    ct, scalar, op, true, stream,
+                    ct, scalar, op, true, streams,
                )
            } else {
-                let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, stream);
-                self.unchecked_comparison_async(ct, &scalar_as_trivial, op, stream)
+                let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams);
+                self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams)
            }
        } else {
            // Unsigned
            self.unchecked_signed_and_unsigned_scalar_comparison_async(
-                ct, scalar, op, false, stream,
+                ct, scalar, op, false, streams,
            )
        }
    }
    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_minmax_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
        op: ComparisonType,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T
    where
        T: CudaIntegerRadixCiphertext,
@@ -327,16 +328,17 @@ impl CudaServerKey {
                .iter_as::<u64>()
                .collect::<Vec<_>>();

-        let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, stream, 0);
+        let d_scalar_blocks: CudaVec<u64> =
+            CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]);

        let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();

-        let mut result = ct.duplicate_async(stream);
+        let mut result = ct.duplicate_async(streams);

        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                unchecked_scalar_comparison_integer_radix_kb_async(
-                    stream,
+                    streams,
                    &mut result.as_mut().d_blocks.0.d_vec,
                    &ct.as_ref().d_blocks.0.d_vec,
                    &d_scalar_blocks,
@@ -366,7 +368,7 @@ impl CudaServerKey {
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                unchecked_scalar_comparison_integer_radix_kb_async(
-                    stream,
+                    streams,
                    &mut result.as_mut().d_blocks.0.d_vec,
                    &ct.as_ref().d_blocks.0.d_vec,
                    &d_scalar_blocks,
@@ -401,45 +403,45 @@ impl CudaServerKey {

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_eq_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
        Scalar: DecomposableInto<u64>,
    {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::EQ, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::EQ, streams)
    }

    pub fn unchecked_scalar_eq<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
        Scalar: DecomposableInto<u64>,
    {
-        let result = unsafe { self.unchecked_scalar_eq_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_eq_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_eq_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
@@ -449,12 +451,12 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_eq_async(lhs, scalar, stream)
+        self.unchecked_scalar_eq_async(lhs, scalar, streams)
    }

    /// Compares for equality 2 ciphertexts
@@ -473,12 +475,12 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// let size = 4;
    ///
    /// // Generate the client key and the server key:
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);
    ///
    /// let msg1 = 14u64;
    /// let msg2 = 97u64;
@@ -486,12 +488,12 @@ impl CudaServerKey {
    /// let ct1 = cks.encrypt(msg1);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
    ///
-    /// let d_ct_res = sks.scalar_eq(&d_ct1, msg2, &stream);
+    /// let d_ct_res = sks.scalar_eq(&d_ct1, msg2, &streams);
    ///
    /// // Copy the result back to CPU
-    /// let ct_res = d_ct_res.to_boolean_block(&stream);
+    /// let ct_res = d_ct_res.to_boolean_block(&streams);
    ///
    /// // Decrypt:
    /// let dec_result = cks.decrypt_bool(&ct_res);
@@ -501,26 +503,26 @@ impl CudaServerKey {
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
        Scalar: DecomposableInto<u64>,
    {
-        let result = unsafe { self.scalar_eq_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_eq_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_ne_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
@@ -530,12 +532,12 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_ne_async(lhs, scalar, stream)
+        self.unchecked_scalar_ne_async(lhs, scalar, streams)
    }

    /// Compares for equality 2 ciphertexts
@@ -554,12 +556,12 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// let size = 4;
    ///
    /// // Generate the client key and the server key:
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &streams);
    ///
    /// let msg1 = 14u64;
    /// let msg2 = 97u64;
@@ -567,12 +569,12 @@ impl CudaServerKey {
    /// let ct1 = cks.encrypt(msg1);
    ///
    /// // Copy to GPU
-    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &stream);
+    /// let mut d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
    ///
-    /// let d_ct_res = sks.scalar_ne(&d_ct1, msg2, &stream);
+    /// let d_ct_res = sks.scalar_ne(&d_ct1, msg2, &streams);
    ///
    /// // Copy the result back to CPU
-    /// let ct_res = d_ct_res.to_boolean_block(&stream);
+    /// let ct_res = d_ct_res.to_boolean_block(&streams);
    ///
    /// // Decrypt:
    /// let dec_result = cks.decrypt_bool(&ct_res);
@@ -582,185 +584,185 @@ impl CudaServerKey {
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_ne_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_ne_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_ne_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
        Scalar: DecomposableInto<u64>,
    {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::NE, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::NE, streams)
    }

    pub fn unchecked_scalar_ne<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        T: CudaIntegerRadixCiphertext,
        Scalar: DecomposableInto<u64>,
    {
-        let result = unsafe { self.unchecked_scalar_ne_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_ne_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_gt_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GT, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GT, streams)
    }

    pub fn unchecked_scalar_gt<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.unchecked_scalar_gt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_gt_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_ge_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GE, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::GE, streams)
    }

    pub fn unchecked_scalar_ge<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.unchecked_scalar_ge_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_ge_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_lt_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LT, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LT, streams)
    }

    pub fn unchecked_scalar_lt<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.unchecked_scalar_lt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_lt_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_le_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LE, stream)
+        self.unchecked_scalar_comparison_async(ct, scalar, ComparisonType::LE, streams)
    }

    pub fn unchecked_scalar_le<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.unchecked_scalar_le_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_le_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }
    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_gt_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
@@ -770,38 +772,38 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_gt_async(lhs, scalar, stream)
+        self.unchecked_scalar_gt_async(lhs, scalar, streams)
    }

    pub fn scalar_gt<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_gt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_gt_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_ge_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
@@ -811,38 +813,38 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_ge_async(lhs, scalar, stream)
+        self.unchecked_scalar_ge_async(lhs, scalar, streams)
    }

    pub fn scalar_ge<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_ge_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_ge_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_lt_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
@@ -852,37 +854,37 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_lt_async(lhs, scalar, stream)
+        self.unchecked_scalar_lt_async(lhs, scalar, streams)
    }

    pub fn scalar_lt<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_lt_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_lt_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }
    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_le_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
@@ -892,92 +894,102 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_le_async(lhs, scalar, stream)
+        self.unchecked_scalar_le_async(lhs, scalar, streams)
    }

    pub fn scalar_le<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> CudaBooleanBlock
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_le_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_le_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_max_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MAX, stream)
+        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MAX, streams)
    }

-    pub fn unchecked_scalar_max<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_max<Scalar, T>(
+        &self,
+        ct: &T,
+        scalar: Scalar,
+        streams: &CudaStreams,
+    ) -> T
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.unchecked_scalar_max_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_max_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_min_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MIN, stream)
+        self.unchecked_scalar_minmax_async(ct, scalar, ComparisonType::MIN, streams)
    }

-    pub fn unchecked_scalar_min<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_min<Scalar, T>(
+        &self,
+        ct: &T,
+        scalar: Scalar,
+        streams: &CudaStreams,
+    ) -> T
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.unchecked_scalar_min_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.unchecked_scalar_min_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_max_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T
    where
        Scalar: DecomposableInto<u64>,
@@ -987,33 +999,33 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_max_async(lhs, scalar, stream)
+        self.unchecked_scalar_max_async(lhs, scalar, streams)
    }

-    pub fn scalar_max<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_max<Scalar, T>(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_max_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_max_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_min_async<Scalar, T>(
        &self,
        ct: &T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) -> T
    where
        Scalar: DecomposableInto<u64>,
@@ -1023,21 +1035,21 @@ impl CudaServerKey {
        let lhs = if ct.block_carries_are_empty() {
            ct
        } else {
-            tmp_lhs = ct.duplicate_async(stream);
-            self.full_propagate_assign_async(&mut tmp_lhs, stream);
+            tmp_lhs = ct.duplicate_async(streams);
+            self.full_propagate_assign_async(&mut tmp_lhs, streams);
            &tmp_lhs
        };

-        self.unchecked_scalar_min_async(lhs, scalar, stream)
+        self.unchecked_scalar_min_async(lhs, scalar, streams)
    }

-    pub fn scalar_min<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_min<Scalar, T>(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: DecomposableInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let result = unsafe { self.scalar_min_async(ct, scalar, stream) };
-        stream.synchronize();
+        let result = unsafe { self.scalar_min_async(ct, scalar, streams) };
+        streams.synchronize();
        result
    }
 }
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
@@ -26,50 +26,59 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg = 30;
    /// let scalar = 3;
    ///
    /// let ct = cks.encrypt(msg);
-    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
+    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
    ///
    /// // Compute homomorphically a scalar multiplication:
-    /// let d_ct_res = sks.unchecked_scalar_mul(&d_ct, scalar, &mut stream);
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let d_ct_res = sks.unchecked_scalar_mul(&d_ct, scalar, &mut streams);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// let clear: u64 = cks.decrypt(&ct_res);
    /// assert_eq!(scalar * msg, clear);
    /// ```
-    pub fn unchecked_scalar_mul<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn unchecked_scalar_mul<Scalar, T>(
+        &self,
+        ct: &T,
+        scalar: Scalar,
+        streams: &CudaStreams,
+    ) -> T
    where
        Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.unchecked_scalar_mul_assign(&mut result, scalar, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.unchecked_scalar_mul_assign(&mut result, scalar, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn unchecked_scalar_mul_assign_async<Scalar, T>(
        &self,
        ct: &mut T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
        if scalar == Scalar::ZERO {
-            ct.as_mut().d_blocks.0.d_vec.memset_async(0, stream, 0);
+            ct.as_mut()
+                .d_blocks
+                .0
+                .d_vec
+                .memset_async(0, streams, streams.gpu_indexes[0]);
            return;
        }

@@ -80,7 +89,7 @@ impl CudaServerKey {
        if scalar.is_power_of_two() {
            // Shifting cost one bivariate PBS so its always faster
            // than multiplying
-            self.unchecked_scalar_left_shift_assign_async(ct, scalar.ilog2() as u64, stream);
+            self.unchecked_scalar_left_shift_assign_async(ct, scalar.ilog2() as u64, streams);
            return;
        }
        let ciphertext = ct.as_mut();
@@ -104,7 +113,7 @@ impl CudaServerKey {
        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                unchecked_scalar_mul_integer_radix_kb_async(
-                    stream,
+                    streams,
                    &mut ct.as_mut().d_blocks.0.d_vec,
                    decomposed_scalar.as_slice(),
                    has_at_least_one_set.as_slice(),
@@ -129,7 +138,7 @@ impl CudaServerKey {
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                unchecked_scalar_mul_integer_radix_kb_async(
-                    stream,
+                    streams,
                    &mut ct.as_mut().d_blocks.0.d_vec,
                    decomposed_scalar.as_slice(),
                    has_at_least_one_set.as_slice(),
@@ -161,15 +170,15 @@ impl CudaServerKey {
        &self,
        ct: &mut T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.unchecked_scalar_mul_assign_async(ct, scalar, stream);
+            self.unchecked_scalar_mul_assign_async(ct, scalar, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }

    /// Computes homomorphically a multiplication between a scalar and a ciphertext.
@@ -189,63 +198,63 @@ impl CudaServerKey {
    /// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
    ///
    /// let gpu_index = 0;
-    /// let mut stream = CudaStreams::new_single_gpu(gpu_index);
+    /// let mut streams = CudaStreams::new_single_gpu(gpu_index);
    ///
    /// // We have 4 * 2 = 8 bits of message
    /// let size = 4;
-    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut stream);
+    /// let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, size, &mut streams);
    ///
    /// let msg = 30;
    /// let scalar = 3;
    ///
    /// let ct = cks.encrypt(msg);
-    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut stream);
+    /// let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &mut streams);
    ///
    /// // Compute homomorphically a scalar multiplication:
-    /// let d_ct_res = sks.scalar_mul(&d_ct, scalar, &mut stream);
-    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut stream);
+    /// let d_ct_res = sks.scalar_mul(&d_ct, scalar, &mut streams);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&mut streams);
    ///
    /// let clear: u64 = cks.decrypt(&ct_res);
    /// assert_eq!(scalar * msg, clear);
    /// ```
-    pub fn scalar_mul<Scalar, T>(&self, ct: &T, scalar: Scalar, stream: &CudaStreams) -> T
+    pub fn scalar_mul<Scalar, T>(&self, ct: &T, scalar: Scalar, streams: &CudaStreams) -> T
    where
        Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
-        let mut result = unsafe { ct.duplicate_async(stream) };
-        self.scalar_mul_assign(&mut result, scalar, stream);
+        let mut result = unsafe { ct.duplicate_async(streams) };
+        self.scalar_mul_assign(&mut result, scalar, streams);
        result
    }

    /// # Safety
    ///
-    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until stream is synchronised
+    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until streams is synchronised
    pub unsafe fn scalar_mul_assign_async<Scalar, T>(
        &self,
        ct: &mut T,
        scalar: Scalar,
-        stream: &CudaStreams,
+        streams: &CudaStreams,
    ) where
        Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
        if !ct.block_carries_are_empty() {
-            self.full_propagate_assign_async(ct, stream);
+            self.full_propagate_assign_async(ct, streams);
        };

-        self.unchecked_scalar_mul_assign_async(ct, scalar, stream);
+        self.unchecked_scalar_mul_assign_async(ct, scalar, streams);
    }

-    pub fn scalar_mul_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams)
+    pub fn scalar_mul_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams)
    where
        Scalar: ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
        T: CudaIntegerRadixCiphertext,
    {
        unsafe {
-            self.scalar_mul_assign_async(ct, scalar, stream);
+            self.scalar_mul_assign_async(ct, scalar, streams);
        }
-        stream.synchronize();
+        streams.synchronize();
    }
 }
Author	SHA1	Message	Date
Guillermo Oyarzun	f3f05f9068	refactor(gpu): random test fft	2024-06-27 10:10:13 +02:00
Agnes Leroy	3a2bb4470f	fix(gpu): fix gpu index in casts, scalar comparison, scalar mul, etc.	2024-06-27 10:08:11 +02:00
Beka Barbakadze	6120fab886	feat(gpu): Implement propagate_single_carry_get_input_carries	2024-06-26 17:34:28 +02:00
Agnes Leroy	53b68619b0	chore(gpu): call nvidia-smi before launching tests on hyperstack	2024-06-26 16:47:29 +02:00
Guillermo Oyarzun	e854823233	refactor(gpu): speedup twiddles reads	2024-06-26 11:30:05 +02:00
sarah el kazdadi	19e00c484b	feat(zk): zk perf improvements	2024-06-26 11:24:11 +02:00
David Testé	818e480dac	chore(ci): publish only one tag for npm packages NPM doesn't accept tags that are similar to a semantic-version compatible string (e.g 0.7.0 or v0.7). We only publish "latest" tag on release manager discretion.	2024-06-26 09:06:26 +02:00
David Testé	a7fc8a90e1	chore(ci): run build workflow on large windows instance	2024-06-25 18:17:26 +02:00
David Testé	3fad6d194c	chore(ci): avoid cancel ongoing benchmarks on main branch	2024-06-25 17:46:24 +02:00
David Testé	23efcb8dd4	chore(bench): fix benchmark naming format for shortint	2024-06-25 17:46:07 +02:00
David Testé	33c69d9d1f	chore(ci): update slab-github-runner action	2024-06-25 12:00:12 +02:00
David Testé	960d287e92	chore(bench): fix display name for gpu unsigned integer operations	2024-06-25 11:59:08 +02:00
Nicolas Sarlin	662e5402a3	chore(doc): add missing doc for a data breaking change	2024-06-24 16:09:26 +02:00