Compare commits

...

43 Commits

Author SHA1 Message Date
tmontaigu
51372d9860 feat(hlapi): add flip operation 2025-09-05 16:09:47 +02:00
tmontaigu
dae7aff3b7 feat(integer): add flip operation
Add the flip(condition: BooleanBlock, a: T, b: T) -> (T, T)
operation that homomorphically flips/swaps two values if the
given encrypted boolean encrypts true.
2025-09-05 16:09:47 +02:00
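[Editor's note: a minimal clear-text Rust sketch of the flip semantics described in the commit message above. This is an illustration only, not the actual TFHE-rs integer or HLAPI entry point; in the homomorphic version the condition is an encrypted BooleanBlock and the selection happens without revealing which branch was taken.]

// Clear-text semantics of flip: the pair is returned unchanged when the
// condition is false, and swapped when it is true.
fn flip_clear<T>(condition: bool, a: T, b: T) -> (T, T) {
    if condition {
        (b, a) // condition encrypts true: the two values are swapped
    } else {
        (a, b) // condition encrypts false: the pair is returned unchanged
    }
}

fn main() {
    assert_eq!(flip_clear(false, 1u8, 2u8), (1, 2));
    assert_eq!(flip_clear(true, 1u8, 2u8), (2, 1));
}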
Nicolas Sarlin
adcf9bc1f3 fix(zk): handle limit cases in the four_squares algorithm 2025-09-05 15:34:44 +02:00
pgardratzama
0a1651adf3 fix(hpu): update firmware in bitstream to allow SIMD operations 2025-09-05 10:42:36 +02:00
pgardratzama
11b540c456 chore(hpu): adds cost of hpu setups 2025-09-05 10:42:36 +02:00
pgardratzama
bd7df4a03b chore(hpu): enable hpu hlapi workflow and throughput bench in integer workflow 2025-09-05 10:42:36 +02:00
pgardratzama
2279d0deb8 chore(hpu): update hpu firmware (fix 2 bits operations issue) 2025-09-05 10:42:36 +02:00
pgardratzama
6fe24c6ab3 chore(hpu): update hpu integer bench scalar op names 2025-09-05 10:42:36 +02:00
pgardratzama
46c6adb0dc feat(hpu): create a new workflow to launch HLAPI benches for HPU 2025-09-05 10:42:36 +02:00
pgardratzama
c6aa1adbe7 chore(hpu): update benches to run new operations 2025-09-05 10:42:36 +02:00
Helder Campos
d3a867ecfe feat(hpu): High bandwidth HPU 2025-09-05 10:42:36 +02:00
Helder Campos
a83c92f28f feat(hpu): Soft Reset Support and fix some runtime registers 2025-09-05 10:42:36 +02:00
Helder Campos
3b48ef301e feat(hpu): Made two SIMD IOPs, ADD and ERC20. 2025-09-05 10:42:36 +02:00
Helder Campos
827a6e912c feat(hpu): Adding a massively parallel multiplier operation 2025-09-05 10:42:36 +02:00
Guillermo Oyarzun
eeccace7b3 fix(gpu): add missing syncs when releasing scalar ops and returning to old lut release 2025-09-05 09:53:00 +02:00
dependabot[bot]
01d1fa96d7 chore(deps): bump on-headers and serve in /tfhe/web_wasm_parallel_tests
Bumps [on-headers](https://github.com/jshttp/on-headers) to 1.1.0 and updates ancestor dependency [serve](https://github.com/vercel/serve). These dependencies need to be updated together.


Updates `on-headers` from 1.0.2 to 1.1.0
- [Release notes](https://github.com/jshttp/on-headers/releases)
- [Changelog](https://github.com/jshttp/on-headers/blob/master/HISTORY.md)
- [Commits](https://github.com/jshttp/on-headers/compare/v1.0.2...v1.1.0)

Updates `serve` from 14.2.3 to 14.2.5
- [Release notes](https://github.com/vercel/serve/releases)
- [Commits](https://github.com/vercel/serve/compare/14.2.3...v14.2.5)

---
updated-dependencies:
- dependency-name: on-headers
  dependency-version: 1.1.0
  dependency-type: indirect
- dependency-name: serve
  dependency-version: 14.2.5
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-05 09:14:55 +02:00
Arthur Meyre
10789ba3d1 chore(ci): configure tfhe-ntt tests to have an avx512 + IFMA instance
- ubuntu-latest is replaced by m6i.4xlarge to make sure all code is tested
in the tfhe-ntt crate
2025-09-05 09:14:12 +02:00
David Testé
4a0658389e chore(bench): make bits to prove customizable in zk benchmarks
Some applications, such as blockchains, may want to prove fewer bits
than the CRS size allows.
2025-09-05 09:03:24 +02:00
David Testé
97574bdae8 chore(bench): add noise squash benchmark with compressions
This new benchmark is extracted from a use case.
Starting from a compressed ciphertext, it measures the decompression, then the noise squashing, and finally the re-compression of the result.
2025-09-04 15:13:08 +02:00
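[Editor's note: a minimal sketch of the measured pipeline. The functions decompress, squash_noise, and compress are hypothetical placeholders, not the real TFHE-rs calls, and plain std::time timing stands in for the Criterion harness used by the actual benchmark.]

use std::time::Instant;

// Hypothetical placeholder stages standing in for the real TFHE-rs operations.
fn decompress(compressed: &[u8]) -> Vec<u8> { compressed.to_vec() }
fn squash_noise(ct: &[u8]) -> Vec<u8> { ct.to_vec() }
fn compress(ct: &[u8]) -> Vec<u8> { ct.to_vec() }

fn main() {
    let compressed_input = vec![0u8; 1024];

    // Stage 1: decompression of the input ciphertext.
    let t0 = Instant::now();
    let ct = decompress(&compressed_input);
    println!("decompression: {:?}", t0.elapsed());

    // Stage 2: noise squashing of the decompressed ciphertext.
    let t1 = Instant::now();
    let squashed = squash_noise(&ct);
    println!("noise squashing: {:?}", t1.elapsed());

    // Stage 3: re-compression of the squashed result.
    let t2 = Instant::now();
    let _recompressed = compress(&squashed);
    println!("re-compression: {:?}", t2.elapsed());
}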
Guillermo Oyarzun
60d137de6e feat(gpu): use mempools to optimize mem reuse 2025-09-04 13:23:18 +02:00
Guillermo Oyarzun
c2e816a86c fix(gpu): change minimum number of elements in benches 2025-09-04 11:03:27 +02:00
Pedro Alves
b42ba79145 feat(gpu): implement support for 128-bit compression on the HL API 2025-09-03 14:33:08 -03:00
Agnes Leroy
69b055c03f chore(gpu): update parameters for classical pbs128 2025-09-03 17:22:52 +02:00
Nicolas Sarlin
e2c7359057 chore(csprng): use getrandom as random source for unix seeder 2025-09-03 17:21:22 +02:00
Guillermo Oyarzun
baad6a6b49 feat(gpu): change broadcast lut to communicate the minimum possible 2025-09-03 15:20:58 +02:00
Guillermo Oyarzun
88c3df8331 feat(gpu): improve communication scheme 2025-09-03 15:20:58 +02:00
Nicolas Sarlin
e3686ed4ba chore(fft): remove dead store in stockham dif16 2025-09-02 16:48:56 +02:00
Nicolas Sarlin
b8a9a15883 doc: explain how to run first example 2025-09-02 16:48:33 +02:00
Nicolas Sarlin
a7d931449a doc(core): remove warning about glwe polynomial size of 1 2025-09-02 15:49:15 +02:00
Nicolas Sarlin
099bccd85f chore(safe_ser): check serialization header version 2025-09-01 17:29:47 +02:00
Nicolas Sarlin
b9d75c9f8f fix: remove references to 2^-64 pfail for GPU 2025-09-01 17:01:15 +02:00
Nicolas Sarlin
543517cea5 chore(core): use checked_mul for container indexing 2025-09-01 15:36:44 +02:00
Nicolas Sarlin
fed5c1db1e fix(core): potential overflow for glwe encrypt on 32b platforms 2025-09-01 15:36:06 +02:00
Nicolas Sarlin
c9249fe991 chore(core): size checks in Fourier128GgswCiphertext::from_container 2025-09-01 15:35:58 +02:00
Nicolas Sarlin
d308305eb1 doc(core): add some "panics" comments 2025-09-01 15:35:41 +02:00
Nicolas Sarlin
f66730deb6 chore(core)!: add ExactSizeIterator to izip macro, renamed izip_eq 2025-09-01 15:35:41 +02:00
dependabot[bot]
cd92146c38 chore(deps): bump actions/cache from 4.2.0 to 4.2.4
Bumps [actions/cache](https://github.com/actions/cache) from 4.2.0 to 4.2.4.
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v4.2.0...0400d5f644dc74513175e3cd8d07132dd4860809)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-version: 4.2.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:26:00 +02:00
dependabot[bot]
568f77f5f6 chore(deps): bump actions/setup-node from 4.0.2 to 4.4.0
Bumps [actions/setup-node](https://github.com/actions/setup-node) from 4.0.2 to 4.4.0.
- [Release notes](https://github.com/actions/setup-node/releases)
- [Commits](60edb5dd54...49933ea528)

---
updated-dependencies:
- dependency-name: actions/setup-node
  dependency-version: 4.4.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:51 +02:00
dependabot[bot]
f610712e97 chore(deps): bump foundry-rs/foundry-toolchain from 1.3.1 to 1.4.0
Bumps [foundry-rs/foundry-toolchain](https://github.com/foundry-rs/foundry-toolchain) from 1.3.1 to 1.4.0.
- [Release notes](https://github.com/foundry-rs/foundry-toolchain/releases)
- [Changelog](https://github.com/foundry-rs/foundry-toolchain/blob/master/RELEASE.md)
- [Commits](de808b1eea...82dee4ba65)

---
updated-dependencies:
- dependency-name: foundry-rs/foundry-toolchain
  dependency-version: 1.4.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:42 +02:00
dependabot[bot]
5d8f0b8532 chore(deps): bump actions/checkout from 4.1.7 to 5.0.0
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.7 to 5.0.0.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v4.1.7...08c6903cd8c0fde910a37f88322edcfb5dd907a8)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 5.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:35 +02:00
dependabot[bot]
11c04d0cc9 chore(deps): bump docker/login-action from 3.3.0 to 3.5.0
Bumps [docker/login-action](https://github.com/docker/login-action) from 3.3.0 to 3.5.0.
- [Release notes](https://github.com/docker/login-action/releases)
- [Commits](9780b0c442...184bdaa072)

---
updated-dependencies:
- dependency-name: docker/login-action
  dependency-version: 3.5.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:27 +02:00
Pedro Alves
57ea3e3e88 chore(gpu): refactor the entry points for PBS in the backend 2025-08-29 16:46:27 -03:00
Pedro Alves
cad4070ebe fix(gpu): fix the decompression function signature in the backend 2025-08-29 21:09:40 +02:00
132 changed files with 4939 additions and 1471 deletions

View File

@@ -0,0 +1,98 @@
# Run all HLAPI benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: Hpu Hlapi Benchmarks
on:
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
permissions: {}
jobs:
hlapi-benchmarks-hpu:
name: Execute HLAPI benchmarks for HPU backend
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
lfs: true
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run benchmarks
run: |
make pull_hpu_files
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
make bench_hlapi_erc20_hpu
make bench_hlapi_hpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "hpu_x1" \
--backend hpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_hlapi_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}

View File

@@ -3,6 +3,15 @@ name: Hpu Integer Benchmarks
on:
workflow_dispatch:
inputs:
bench_type:
description: "Benchmarks type"
type: choice
default: both
options:
- latency
- throughput
- both
env:
CARGO_TERM_COLOR: always
@@ -14,13 +23,46 @@ env:
permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: v80-desktop
outputs:
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
steps:
- name: Set benchmark types
if: github.event_name == 'workflow_dispatch'
run: |
if [[ -z $INPUTS_BENCH_TYPE || "${INPUTS_BENCH_TYPE}" == "both" ]]; then
echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
else
echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
fi
env:
INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}
- name: Default benchmark type
if: github.event_name != 'workflow_dispatch'
run: |
echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
- name: Set benchmark types output
id: set_bench_type
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
needs: prepare-matrix
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
strategy:
max-parallel: 1
matrix:
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
@@ -63,8 +105,11 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
make bench_integer_hpu
make bench_hlapi_erc20_hpu
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
env:
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Parse results
run: |
@@ -76,14 +121,16 @@ jobs:
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
--walk-subdirs \
--bench-type "${BENCH_TYPE}"
env:
REF_NAME: ${{ github.ref_name }}
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_integer_benchmarks
name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab

View File

@@ -11,6 +11,7 @@ env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -31,7 +32,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
@@ -45,18 +46,46 @@ jobs:
- tfhe-ntt/**
- '.github/workflows/cargo_test_ntt.yml'
cargo-tests-ntt:
setup-instance:
needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ubuntu-latest
outputs:
matrix_os: ${{ steps.set-os-matrix.outputs.matrix_os }}
runner-name: ${{ steps.start-remote-instance.outputs.label }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
- name: Set os matrix
id: set-os-matrix
env:
SLAB_INSTANCE: ${{ steps.start-remote-instance.outputs.label }}
run: |
INSTANCE_TO_USE="${SLAB_INSTANCE:-ubuntu-latest}"
echo "matrix_os=[\"${INSTANCE_TO_USE}\", \"macos-latest\", \"windows-latest\"]" >> "$GITHUB_OUTPUT"
cargo-tests-ntt:
needs: [should-run, setup-instance]
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -72,16 +101,16 @@ jobs:
run: make test_ntt_no_std
cargo-tests-ntt-nightly:
needs: should-run
needs: [should-run, setup-instance]
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -97,7 +126,7 @@ jobs:
run: make test_ntt_no_std_nightly
cargo-tests-ntt-successful:
needs: [ should-run, cargo-tests-ntt, cargo-tests-ntt-nightly ]
needs: [should-run, cargo-tests-ntt, cargo-tests-ntt-nightly]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
@@ -120,3 +149,28 @@ jobs:
run: |
echo "Some tfhe-ntt tests failed"
exit 1
teardown-instance:
name: Teardown instance (cargo-tests-ntt-successful)
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, cargo-tests-ntt-successful]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cargo-tests-ntt) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -100,7 +100,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
path: tfhe-rs
persist-credentials: false
@@ -111,7 +111,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -162,10 +162,10 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
uses: foundry-rs/foundry-toolchain@de808b1eea699e761c404bda44ba8f21aba30b2c
uses: foundry-rs/foundry-toolchain@82dee4ba654bd2146511f85f0d013af94670c4de
- name: Cache cargo
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: |
~/.cargo/registry
@@ -175,7 +175,7 @@ jobs:
restore-keys: ${{ runner.os }}-cargo-
- name: Login to GitHub Container Registry
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
@@ -186,7 +186,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
- name: Use Node.js
uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: 20.x
@@ -257,7 +257,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
repository: zama-ai/slab
path: slab

View File

@@ -33,6 +33,7 @@ rand = "0.8"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.100"
getrandom = "0.2.8"
[profile.bench]
lto = "fat"

View File

@@ -1312,11 +1312,11 @@ bench_signed_integer_gpu: install_rs_check_toolchain
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
@@ -1497,11 +1497,13 @@ bench_hlapi_gpu: install_rs_check_toolchain
--bench hlapi \
--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_hpu # Run benchmarks for integer operations on HPU
.PHONY: bench_hlapi_hpu # Run benchmarks for HLAPI operations on HPU
bench_hlapi_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi \
--features=integer,hpu,hpu-v80,internal-keycache,nightly-avx512 -p tfhe-benchmark --
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
bench_hlapi_erc20: install_rs_check_toolchain
@@ -1529,11 +1531,11 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain

View File

@@ -201,11 +201,9 @@ When a new update is published in the Lattice Estimator, we update parameters ac
### Security model
By default, the parameter sets used in the High-Level API with the x86 CPU backend have a failure probability $\le 2^{128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).
The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-128}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].
[1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
[2] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf

View File

@@ -119,6 +119,8 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
int cuda_get_number_of_gpus();
int cuda_get_number_of_sms();
void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);

View File

@@ -16,24 +16,27 @@ int32_t cuda_setup_multi_gpu(int device_0_id);
template <typename Torus>
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<Torus *>>(variant)) { \
return std::get<std::vector<Torus *>>(variant)[index]; \
} else { \
return std::get<Torus *>(variant); \
} \
}()
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
return std::get<std::vector<uint64_t *>>(variant)[index]; \
} else { \
return std::get<uint64_t *>(variant); \
} \
}()
/// get_variant_element() resolves access when the input may be either a single
/// pointer or a vector of pointers. If the variant holds a single pointer, the
/// index is ignored and that pointer is returned; if it holds a vector, the
/// element at `index` is returned.
///
/// This function replaces the previous macro:
/// - Easier to debug and read than a macro
/// - Deduces the pointer type from the variant (no need to name a Torus type
/// explicitly)
/// - Defined in a header, so it's eligible for inlining by the optimizer
template <typename Torus>
inline Torus
get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
size_t index) {
if (std::holds_alternative<std::vector<Torus>>(variant)) {
return std::get<std::vector<Torus>>(variant)[index];
} else {
return std::get<Torus>(variant);
}
}
int get_active_gpu_count(int num_inputs, int gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

View File

@@ -75,7 +75,7 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory, bool allocate_ms_array);
bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -115,8 +115,10 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

View File

@@ -6,7 +6,6 @@
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "pbs/programmable_bootstrap_128.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
@@ -321,10 +320,15 @@ template <typename Torus> struct int_radix_lut {
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
std::vector<Torus *> lwe_aligned_vec;
uint32_t *gpu_indexes;
bool gpu_memory_allocated;
cudaEvent_t event_scatter_in;
cudaEvent_t *event_scatter_out;
cudaEvent_t event_broadcast;
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
@@ -343,7 +347,6 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -360,10 +363,21 @@ template <typename Torus> struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// We create the events only if we have multiple GPUs
if (active_gpu_count > 1) {
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
event_scatter_out =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -382,8 +396,6 @@ template <typename Torus> struct int_radix_lut {
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -499,11 +511,8 @@ template <typename Torus> struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -560,7 +569,6 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -577,10 +585,19 @@ template <typename Torus> struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// We create the events only if we have multiple GPUs
if (active_gpu_count > 1) {
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
event_scatter_out =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -596,11 +613,8 @@ template <typename Torus> struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -651,11 +665,9 @@ template <typename Torus> struct int_radix_lut {
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks,
size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, lwe_trivial_indexes,
num_radix_blocks, allocate_gpu_memory);
multi_gpu_copy_array_from_cpu_async(
streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
h_lwe_indexes_in, num_radix_blocks, allocate_gpu_memory);
// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -712,29 +724,87 @@ template <typename Torus> struct int_radix_lut {
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
int active_device = cuda_get_device();
// We only do broadcast if there are more than 1 active GPU
if (active_gpu_count > 1) {
int active_device = cuda_get_device();
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
}
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t new_active_gpu_count,
bool broadcast_lut_values = true) {
// We only do broadcast if there are more than 1 active GPU
if (new_active_gpu_count > 1) {
int active_device = cuda_get_device();
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
if (active_gpu_count > 1)
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < new_active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
}
void allocate_lwe_vector_for_non_trivial_indexes(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t active_gpu_count, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
// We need to create the auxiliary array only in GPU 0
lwe_aligned_vec.resize(active_gpu_count);
for (uint i = 0; i < active_gpu_count; i++) {
uint64_t size_tracker_on_array_i = 0;
auto inputs_on_gpu = std::max(
THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(max_num_radix_blocks, i, active_gpu_count));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker_on_array_i,
allocate_gpu_memory);
lwe_aligned_vec[i] = d_array;
size_tracker += size_tracker_on_array_i;
}
// Ensure the device set at the end of this method is the same as it was set
// at the beginning
cuda_set_device(active_device);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -746,7 +816,6 @@ template <typename Torus> struct int_radix_lut {
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -793,6 +862,23 @@ template <typename Torus> struct int_radix_lut {
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
if (active_gpu_count > 1) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
}
cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
cuda_event_destroy(event_broadcast, gpu_indexes[0]);
free(event_scatter_out);
}
if (lwe_aligned_vec.size() > 0) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_with_size_tracking_async(lwe_aligned_vec[i], streams[0],
gpu_indexes[0],
gpu_memory_allocated);
}
lwe_aligned_vec.clear();
}
}
free(h_lut_indexes);
free(degrees);
@@ -841,6 +927,8 @@ template <typename InputTorus> struct int_noise_squashing_lut {
bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
std::vector<InputTorus *> lwe_aligned_scatter_vec;
std::vector<__uint128_t *> lwe_aligned_gather_vec;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,
@@ -876,11 +964,11 @@ template <typename InputTorus> struct int_noise_squashing_lut {
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
int8_t *gpu_pbs_buffer;
uint64_t size = 0;
execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
params.small_lwe_dimension, params.glwe_dimension,
params.polynomial_size, params.pbs_level,
num_radix_blocks_on_gpu, allocate_gpu_memory,
params.noise_reduction_type, size);
execute_scratch_pbs<__uint128_t>(
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_radix_blocks_on_gpu, params.pbs_type,
allocate_gpu_memory, params.noise_reduction_type, size);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
if (i == 0) {
size_tracker += size;
@@ -982,7 +1070,10 @@ template <typename InputTorus> struct int_noise_squashing_lut {
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
if (lwe_aligned_gather_vec.size() > 0) {
multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_gather_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_scatter_vec);
}
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
@@ -1045,7 +1136,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1071,6 +1165,9 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);
lut->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count,
num_radix_blocks * bits_per_block, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes_in);
@@ -1212,7 +1309,9 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
mux_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_mux =
get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
mux_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_mux);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
@@ -1222,7 +1321,10 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
cleaning_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_cleaning =
get_active_gpu_count(num_radix_blocks, gpu_count);
cleaning_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_cleaning);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1311,8 +1413,8 @@ template <typename Torus> struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(2, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -1447,9 +1549,11 @@ template <typename Torus> struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
luts_array->broadcast_lut(streams, gpu_indexes);
luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes);
message_acc->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
message_acc->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1558,9 +1662,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t total_messages = 0;
current_columns.next_accumulation(total_ciphertexts, total_messages,
_needs_processing);
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (!mem_reuse) {
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
@@ -1568,6 +1671,11 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
auto active_gpu_count =
get_active_gpu_count(this->max_total_blocks_in_vec, gpu_count);
luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
size_tracker, true);
}
}
if (allocated_luts_message_carry) {
@@ -1595,7 +1703,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_mc = get_active_gpu_count(pbs_count, gpu_count);
luts_message_carry->broadcast_lut(streams, gpu_indexes,
active_gpu_count_mc);
}
}
int_sum_ciphertexts_vec_memory(
@@ -1615,6 +1725,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
setup_index_buffers(streams, gpu_indexes, size_tracker);
// because we setup_lut in host function for sum_ciphertexts to save memory
// the size_tracker is topped up here to have a max bound on the used memory
@@ -1662,6 +1773,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
size_tracker, allocate_gpu_memory);
setup_index_buffers(streams, gpu_indexes, size_tracker);
}
@@ -1745,8 +1859,9 @@ template <typename Torus> struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_seq_luts, gpu_count);
lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
free(h_seq_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1801,8 +1916,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_degree(0), lut_hillis_steele->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_hillis_steele, gpu_memory_allocated);
lut_hillis_steele->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_groups, gpu_count);
lut_hillis_steele->broadcast_lut(streams, gpu_indexes, active_gpu_count);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -1978,8 +2093,9 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2240,7 +2356,9 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
luts_array_second_step->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_second_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -2259,14 +2377,17 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, Torus *new_lut_indexes,
Torus *new_scalars, uint32_t new_num_blocks) {
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *new_lut_indexes, Torus *new_scalars,
uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
luts_array_second_step->broadcast_lut(streams, gpu_indexes);
auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(streams, gpu_indexes,
new_active_gpu_count, false);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
@@ -2431,7 +2552,9 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
}
// For the final cleanup in case of overflow or carry (it seems that I can)
@@ -2500,7 +2623,9 @@ template <typename Torus> struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
}
lut_message_extract->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(num_radix_blocks + 1, gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -2695,19 +2820,23 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
};
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, Torus *new_lut_indexes,
uint32_t new_num_blocks) {
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *new_lut_indexes, uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
new_active_gpu_count, false);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2806,8 +2935,9 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes);
lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
if (compute_overflow) {
lut_borrow_flag = new int_radix_lut<Torus>(
@@ -2823,8 +2953,7 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_borrow_flag->get_degree(0), lut_borrow_flag->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_borrow_flag, gpu_memory_allocated);
lut_borrow_flag->broadcast_lut(streams, gpu_indexes);
lut_borrow_flag->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -2852,15 +2981,15 @@ template <typename Torus> struct int_borrow_prop_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *first_indexes_for_div,
Torus *second_indexes_for_div, Torus *scalars_for_div,
uint32_t new_num_blocks) {
shifted_blocks_borrow_state_mem->update_lut_indexes(
streams, gpu_indexes, first_indexes_for_div, new_num_blocks);
streams, gpu_indexes, gpu_count, first_indexes_for_div, new_num_blocks);
prop_simu_group_carries_mem->update_lut_indexes(
streams, gpu_indexes, second_indexes_for_div, scalars_for_div,
new_num_blocks);
streams, gpu_indexes, gpu_count, second_indexes_for_div,
scalars_for_div, new_num_blocks);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2991,7 +3120,10 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3064,8 +3196,8 @@ template <typename Torus> struct int_mul_memory {
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
luts_array->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(total_block_count, gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3197,7 +3329,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -3281,13 +3414,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (auto &buffer : lut_buffers_bivariate) {
buffer->release(streams, gpu_indexes, gpu_count);
delete buffer;
@@ -3385,7 +3520,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -3410,7 +3547,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -3449,7 +3588,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}
@@ -3457,6 +3598,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
@@ -3563,9 +3705,13 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
predicate_lut->broadcast_lut(streams, gpu_indexes);
message_extract_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_pred =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
predicate_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_pred);
auto active_gpu_count_msg =
get_active_gpu_count(num_radix_blocks, gpu_count);
message_extract_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_msg);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3637,7 +3783,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(max_chunks, gpu_count);
is_max_value->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3697,7 +3844,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
operator_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
operator_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// f(x) -> x == 0
Torus total_modulus = params.message_modulus * params.carry_modulus;
@@ -3715,7 +3863,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
is_non_zero_lut->broadcast_lut(streams, gpu_indexes);
is_non_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -3734,8 +3882,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
scalar_comparison_luts->broadcast_lut(streams, gpu_indexes);
scalar_comparison_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3806,8 +3954,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3994,8 +4142,7 @@ template <typename Torus> struct int_comparison_buffer {
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(streams, gpu_indexes);
identity_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -4012,7 +4159,7 @@ template <typename Torus> struct int_comparison_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
is_zero_lut->broadcast_lut(streams, gpu_indexes);
is_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -4094,8 +4241,8 @@ template <typename Torus> struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
signed_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
}
@@ -4144,6 +4291,7 @@ template <typename Torus> struct int_comparison_buffer {
delete signed_msb_lut;
delete tmp_trivial_sign_block;
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
@@ -4310,17 +4458,23 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
streams, gpu_indexes, gpu_count, params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], masking_luts_1[i]->get_lut(0, 0),
masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_gpu_count1 = get_active_gpu_count(1, gpu_count);
masking_luts_1[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count1);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_masking,
gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], masking_luts_2[i]->get_lut(0, 0),
masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_gpu_count2 = get_active_gpu_count(num_blocks, gpu_count);
masking_luts_2[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count2);
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -4340,13 +4494,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
luts[j]->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
// Give name to closures to improve readability
@@ -4382,7 +4537,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
@@ -4391,7 +4547,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -4418,7 +4575,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_happened[1]->get_lut(0, 0),
@@ -4427,10 +4585,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
auto active_gpu_count_for_bits = get_active_gpu_count(1, gpu_count);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -4447,7 +4607,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes);
merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes,
active_gpu_count_for_bits);
}
}
@@ -4703,6 +4864,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
delete[] merge_overflow_flags_luts;
// release sub streams
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -4764,7 +4926,7 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
switch (op) {
case BITAND:
case BITOR:
@@ -4791,7 +4953,7 @@ template <typename Torus> struct int_bitop_buffer {
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
break;
default:
@@ -4821,7 +4983,7 @@ template <typename Torus> struct int_bitop_buffer {
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
}
}
@@ -5106,7 +5268,10 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_cmp =
get_active_gpu_count(1, gpu_count); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_cmp);
}
}
@@ -5148,6 +5313,7 @@ template <typename Torus> struct int_div_rem_memory {
delete compare_signed_bits_lut;
// release sub streams
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -5776,7 +5942,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
this->univ_lut_mem = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -5815,7 +5981,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
univ_lut_mem->broadcast_lut(streams, gpu_indexes);
univ_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
auto generate_bi_lut_lambda =
@@ -5834,7 +6000,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
biv_lut_mem->broadcast_lut(streams, gpu_indexes);
biv_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
this->tmp_ct = new CudaRadixCiphertextFFI;
@@ -6052,7 +6218,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
cuda_memcpy_async_to_gpu(luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
luts->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
luts->broadcast_lut(streams, gpu_indexes, active_gpu_count);
free(h_corrections);
}
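The change that repeats throughout this file is that broadcast_lut now receives the number of GPUs that will actually take part, derived from how many radix blocks the LUT is applied to, instead of implicitly targeting every GPU. A minimal sketch of the idea with a hypothetical helper standing in for get_active_gpu_count (whose real implementation may differ):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in: never use more GPUs than there are blocks to process.
inline uint32_t active_gpus_for(uint32_t num_blocks, uint32_t gpu_count) {
  return std::min(num_blocks, gpu_count);
}

int main() {
  uint32_t gpu_count = 8;
  // A LUT applied to a single block (sign, padding, ...) only needs to live on
  // one GPU, while a LUT applied to every radix block may need all of them.
  std::printf("%u\n", active_gpus_for(1, gpu_count));  // 1
  std::printf("%u\n", active_gpus_for(32, gpu_count)); // 8
  return 0;
}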

View File

@@ -232,8 +232,13 @@ template <typename Torus> struct zk_expand_mem {
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),

View File

@@ -157,12 +157,12 @@ void execute_keyswitch_async(cudaStream_t const *streams,
for (uint i = 0; i < gpu_count; i++) {
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_array_out = get_variant_element(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
get_variant_element(lwe_output_indexes, i);
Torus *current_lwe_array_in = get_variant_element(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
get_variant_element(lwe_input_indexes, i);
// Compute Keyswitch
host_keyswitch_lwe_ciphertext_vector<Torus>(

View File

@@ -202,9 +202,9 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
auto stride_KSK_buffer = glwe_accumulator_size * level_count;
// Shared memory requirement is 4096, 8192, and 16384 bytes respectively for
// 32, 64, and 128-bit Torus elements We want to keep this as a sanity check
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
// Shared memory requirement is 4096, 8192, and 16384 bytes respectively for
// 32, 64, and 128-bit Torus elements
// Sanity check: the shared memory size is a constant defined by the algorithm
GPU_ASSERT(shared_mem_size <= 1024 * sizeof(Torus),
"GEMM kernel error: shared memory required might be too large");

View File

@@ -1,15 +1,88 @@
#include "device.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <mutex>
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
return static_cast<uint32_t>(device);
}
std::mutex pool_mutex;
bool mem_pools_enabled = false;
// We use memory pools to reduce some of the overhead of memory allocations
// caused by our scratch/release pattern. This function is the simplest way of
// using mempools: it modifies the default memory pool to use a threshold of 5%
// of the total memory:
// - Enable opportunistic reuse to maximize reuse in malloc/free patterns
// - Prevent memory from being released back to the OS too soon if it is within
// our threshold
// - Warm up the pool by allocating and freeing a large block of memory
// This function is called only once, the first time a GPU is set, and it
// configures all the GPUs available.
// We have measured an improvement of around 10% in our integer operations,
// especially the ones involving many allocations.
// We tested more complex configurations of mempools, but they did not yield
// better results.
void cuda_setup_mempool(uint32_t caller_gpu_index) {
if (!mem_pools_enabled) {
pool_mutex.lock();
if (mem_pools_enabled) {
pool_mutex.unlock(); // already enabled by another thread, nothing more to do
return;
}
// We do it only once for all GPUs
mem_pools_enabled = true;
uint32_t num_gpus = cuda_get_number_of_gpus();
for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cuda_set_device(gpu_index);
size_t total_mem, free_mem;
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
// If we have more than 5% of free memory, we can set up the mempool
uint64_t mem_pool_threshold = total_mem / 20; // 5% of total memory
mem_pool_threshold =
mem_pool_threshold - (mem_pool_threshold % 1024); // Align to 1KB
if (mem_pool_threshold < free_mem) {
// Get default memory pool
cudaMemPool_t default_pool;
check_cuda_error(cudaDeviceGetDefaultMemPool(&default_pool, gpu_index));
// Enable opportunistic reuse
int reuse = 1;
check_cuda_error(cudaMemPoolSetAttribute(
default_pool, cudaMemPoolReuseAllowOpportunistic, &reuse));
// Prevent memory from being released back to the OS too soon
check_cuda_error(cudaMemPoolSetAttribute(
default_pool, cudaMemPoolAttrReleaseThreshold,
&mem_pool_threshold));
// Warm up the pool by allocating and freeing a large block
cudaStream_t stream;
stream = cuda_create_stream(gpu_index);
void *warmup_ptr = nullptr;
warmup_ptr = cuda_malloc_async(mem_pool_threshold, stream, gpu_index);
cuda_drop_async(warmup_ptr, stream, gpu_index);
// Sync to ensure pool is grown
cuda_synchronize_stream(stream, gpu_index);
// Clean up
cuda_destroy_stream(stream, gpu_index);
}
}
// We return to the original gpu_index
cuda_set_device(caller_gpu_index);
pool_mutex.unlock();
}
}
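For reference, the configuration applied above can be reproduced with nothing but the CUDA runtime API. A minimal sketch under the same assumptions as the comment above (release threshold of 5% of total memory, opportunistic reuse, one warm-up allocation); error checking, the loop over all GPUs and the tfhe-rs wrappers (cuda_malloc_async, cuda_drop_async, ...) are left out:

#include <cstdint>
#include <cuda_runtime.h>

// Configure the default memory pool of a single device, mirroring the settings
// described in the comment above. Illustrative sketch only.
void configure_default_pool(int device) {
  cudaSetDevice(device);
  size_t free_mem = 0, total_mem = 0;
  cudaMemGetInfo(&free_mem, &total_mem);
  uint64_t threshold = total_mem / 20; // 5% of total memory
  threshold -= threshold % 1024;       // align to 1KB
  cudaMemPool_t pool;
  cudaDeviceGetDefaultMemPool(&pool, device);
  int reuse = 1; // allow reusing freed blocks for later allocations
  cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowOpportunistic, &reuse);
  // keep up to `threshold` bytes in the pool instead of returning them to the OS
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
  // warm up: grow the pool once so later allocations are served from it
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  void *warmup = nullptr;
  cudaMallocAsync(&warmup, threshold, stream);
  cudaFreeAsync(warmup, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}

std::call_once would be an alternative to the hand-rolled flag-and-mutex guard used above; the explicit flag has the advantage that the common path never takes the lock.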
void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
// Mempools are initialized only once, for all the GPUs available
cuda_setup_mempool(gpu_index);
}
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
@@ -329,6 +402,13 @@ int cuda_get_number_of_gpus() {
return num_gpus;
}
int cuda_get_number_of_sms() {
int num_sms = 0;
check_cuda_error(
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
return num_sms;
}
/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
cuda_set_device(gpu_index);

View File

@@ -148,7 +148,8 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
lut = is_max_value_lut;
}
@@ -167,7 +168,10 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_is_max =
get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_is_max, false);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
@@ -499,7 +503,9 @@ __host__ void tree_sign_reduction(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true);
last_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(

View File

@@ -100,7 +100,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus, allocate_gpu_memory);
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -112,14 +112,15 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory, bool allocate_ms_array) {
bool allocate_gpu_memory) {
// 128-bit decompression doesn't run PBSs, so we don't need encryption_params
int_radix_params compression_params(
PBS_TYPE::CLASSICAL, compression_glwe_dimension,
compression_polynomial_size,
compression_glwe_dimension * compression_polynomial_size, lwe_dimension,
0, 0, 0, 0, 0, message_modulus, carry_modulus, allocate_ms_array);
0, 0, 0, 0, 0, message_modulus, carry_modulus,
PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_integer_decompress_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,

View File

@@ -344,7 +344,7 @@ host_integer_decompress(cudaStream_t const *streams,
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
extracted_lwe, lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
@@ -363,18 +363,21 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in,
gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_blocks_to_decompress,
extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, nullptr,
@@ -388,13 +391,20 @@ host_integer_decompress(cudaStream_t const *streams,
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
lwe_after_pbs_vec, lut->lwe_indexes_out,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j],
gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
} else {

View File

@@ -311,8 +311,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder->num_radix_blocks);
streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
scalar_indexes, merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,

View File

@@ -558,7 +558,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
@@ -567,16 +567,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -586,7 +590,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -595,15 +599,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -665,7 +674,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
@@ -674,16 +683,19 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -693,7 +705,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -702,15 +714,22 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
multi_gpu_gather_many_lut_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
num_many_lut);
POP_RANGE()
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -787,7 +806,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)(lwe_array_out->ptr),
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
@@ -795,13 +814,17 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
(Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -811,7 +834,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -820,15 +843,20 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -1000,7 +1028,6 @@ void generate_device_accumulator_no_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1104,8 +1131,7 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1137,7 +1163,6 @@ void generate_device_accumulator_with_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1486,7 +1511,7 @@ void host_full_propagate_inplace(
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, 1, 2,
mem_ptr->tmp_small_lwe_vector, 0, 1);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
mem_ptr->lut->lut_indexes_vec,
@@ -1668,6 +1693,7 @@ __host__ void reduce_signs(
"than the number of blocks to operate on")
auto diff_buffer = mem_ptr->diff_buffer;
auto active_gpu_count = mem_ptr->active_gpu_count;
auto params = mem_ptr->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1697,7 +1723,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1728,7 +1754,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
@@ -1748,7 +1774,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1774,7 +1800,8 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
POP_RANGE()
return size_tracker;
}
@@ -1811,7 +1838,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
POP_RANGE()
return size_tracker;
}
@@ -1848,7 +1876,8 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
POP_RANGE()
return size_tracker;
}
@@ -2344,11 +2373,17 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_128_async<__uint128_t>(
///
/// int_noise_squashing_lut doesn't support output or lut indexing other than
/// the trivial one
execute_pbs_async<uint64_t, __uint128_t>(
streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
ms_noise_reduction_key, lut->pbs_buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
0);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -2357,9 +2392,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count,
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
(InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec,
lut->active_gpu_count, lwe_array_out->num_radix_blocks,
lut->input_big_lwe_dimension + 1);
execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
@@ -2367,19 +2403,23 @@ __host__ void integer_radix_apply_noise_squashing_kb(
ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
ks_level, lwe_array_out->num_radix_blocks);
execute_pbs_128_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, lwe_array_out->num_radix_blocks);
/// int_noise_squashing_lut doesn't support output or lut indexing other than
/// the trivial one
execute_pbs_async<uint64_t, __uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lwe_trivial_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
lut->pbs_buffer, glwe_dimension, small_lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, grouping_factor,
lwe_array_out->num_radix_blocks, params.pbs_type, 0, 0);
/// Copy data back to GPU 0 and release vecs
/// In apply noise squashing we always use trivial indexes
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec, nullptr,
lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {

View File

@@ -404,7 +404,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
@@ -415,31 +415,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -479,7 +458,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
@@ -491,31 +470,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(

View File

@@ -34,7 +34,7 @@ void host_integer_grouped_oprf(
auto lut = mem_ptr->luts;
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, (uint32_t)1, (Torus *)(radix_lwe_out->ptr),
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
@@ -48,7 +48,10 @@ void host_integer_grouped_oprf(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
if (!lut->using_trivial_lwe_indexes) {
PANIC("lut->using_trivial_lwe_indexes should be true");
@@ -56,11 +59,11 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
active_gpu_count, num_blocks_to_process,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -72,12 +75,18 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_process,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
for (uint32_t i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
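The synchronization change repeated in the multi-GPU branches above replaces blocking per-stream synchronizations with a fork/join built on CUDA events: GPU 0 records an event that the worker streams wait on before consuming the scattered data, and after the gather each worker records an event that GPU 0's stream waits on. A standalone sketch of that pattern with the raw runtime API (the diff's cuda_event_record / cuda_stream_wait_event helpers wrap these calls; launch_work is a placeholder):

#include <cuda_runtime.h>
#include <vector>

// Fork/join between stream 0 and n-1 worker streams using events rather than
// host-side stream synchronizations. Error checking and event destruction are
// omitted in this sketch.
void fork_join(std::vector<cudaStream_t> &streams,
               const std::vector<int> &gpu_indexes) {
  size_t n = streams.size();
  std::vector<cudaEvent_t> done(n);
  cudaEvent_t input_ready;

  cudaSetDevice(gpu_indexes[0]);
  cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
  // GPU 0 signals that the data it prepared is ready...
  cudaEventRecord(input_ready, streams[0]);

  for (size_t j = 1; j < n; j++) {
    cudaSetDevice(gpu_indexes[j]);
    cudaEventCreateWithFlags(&done[j], cudaEventDisableTiming);
    // ...and every worker stream waits for it before touching that data.
    cudaStreamWaitEvent(streams[j], input_ready, 0);
    // launch_work(streams[j]); // placeholder for scatter + keyswitch + PBS
    cudaEventRecord(done[j], streams[j]);
  }

  // GPU 0 waits for all workers before using the gathered result.
  cudaSetDevice(gpu_indexes[0]);
  for (size_t j = 1; j < n; j++)
    cudaStreamWaitEvent(streams[0], done[j], 0);
}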

View File

@@ -47,7 +47,8 @@ __host__ void host_integer_radix_scalar_bitop_kb(
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,

View File

@@ -154,7 +154,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -253,7 +254,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -286,8 +288,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
one_block_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -434,7 +436,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -540,7 +543,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true);
signed_msb_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -588,8 +592,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
one_block_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -819,7 +823,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
}
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
// We pass false because we only need to broadcast the indexes
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
active_gpu_count, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,

View File

@@ -7,6 +7,7 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "helper_multi_gpu.h"
#include "pbs/pbs_128_utilities.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/polynomial_math.cuh"
@@ -202,15 +203,15 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(
// the buffer in registers to avoid synchronizations and shared memory usage
}
template <typename Torus>
template <typename InputTorus, typename OutputTorus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const std::vector<Torus *> lut_vec,
const std::vector<Torus *> lut_indexes_vec,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes,
uint32_t gpu_count, const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
const LweArrayVariant<InputTorus> &lwe_array_in,
const LweArrayVariant<InputTorus> &lwe_input_indexes,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
@@ -219,8 +220,7 @@ void execute_pbs_async(
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -238,12 +238,12 @@ void execute_pbs_async(
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
streams[i], gpu_indexes[i], current_lwe_array_out,
@@ -257,8 +257,7 @@ void execute_pbs_async(
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
case sizeof(uint64_t):
} else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -271,12 +270,12 @@ void execute_pbs_async(
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
@@ -300,12 +299,12 @@ void execute_pbs_async(
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
@@ -328,10 +327,81 @@ void execute_pbs_async(
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.")
} else if constexpr (std::is_same_v<OutputTorus, __uint128_t>) {
// 128 bits
switch (pbs_type) {
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
// Use the helper to get the correct elements for the current iteration.
// It handles both the case where the inputs/outputs are scattered across
// different GPUs and the case where they are not
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu, num_many_lut, lut_stride);
}
break;
case CLASSICAL:
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
// Use the helper to get the correct elements for the current iteration.
// Handles the case where the input/output are scattered across
// different gpus and the case where they are not
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vec[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu);
}
break;
default:
PANIC("Error: unsupported cuda PBS type.")
}
} else {
static_assert(
std::is_same_v<OutputTorus, uint32_t> ||
std::is_same_v<OutputTorus, uint64_t> ||
std::is_same_v<OutputTorus, __uint128_t>,
"Cuda error: unsupported modulus size: only 32, 64, or 128-bit integer "
"moduli are supported.");
}
}
@@ -344,8 +414,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type,
uint64_t &size_tracker) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
if constexpr (std::is_same_v<Torus, uint32_t>) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -359,8 +428,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
case sizeof(uint64_t):
} else if constexpr (std::is_same_v<Torus, uint64_t>) {
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -379,10 +447,32 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.")
} else if constexpr (std::is_same_v<Torus, __uint128_t>) {
// 128 bits
switch (pbs_type) {
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
size_tracker =
scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
break;
case CLASSICAL:
size_tracker = scratch_cuda_programmable_bootstrap_128(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, noise_reduction_type);
break;
default:
PANIC("Error: unsupported cuda PBS type.")
}
} else {
static_assert(
std::is_same_v<Torus, uint32_t> || std::is_same_v<Torus, uint64_t> ||
std::is_same_v<Torus, __uint128_t>,
"Cuda error: unsupported modulus size: only 32, 64, or 128-bit integer "
"moduli are supported.");
}
}

View File

@@ -1,45 +0,0 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#include "pbs/pbs_128_utilities.h"
static void execute_scratch_pbs_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
uint64_t &size_tracker_on_gpu) {
// The squash noise function receives as input 64-bit integers
size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, noise_reduction_type);
}
template <typename Torus>
static void execute_pbs_128_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
const std::vector<Torus *> lut_vector,
const LweArrayVariant<uint64_t> &lwe_array_in,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
for (uint32_t i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_inputs_on_gpu);
}
}
#endif

View File

@@ -38,6 +38,19 @@ void multi_gpu_copy_array_async(cudaStream_t const *streams,
gpu_indexes[i], gpu_memory_allocated);
}
}
/// Copy an array residing on one CPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_from_cpu_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
uint32_t elements_per_gpu, bool gpu_memory_allocated) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
cuda_memcpy_with_size_tracking_async_to_gpu(
dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
}
/// Allocates the input/output vector for all devices
/// Initializes also the related indexing and initializes it to the trivial
/// index
@@ -93,6 +106,35 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}
// This function reads LWEs using the indexes and places them in a single aligned
// array. It is needed before communication to perform a single
// contiguous data movement. Each block handles one LWE.
template <typename Torus>
__global__ void align_with_indexes(Torus *d_packed_vector,
Torus const *d_vector,
Torus const *d_indexes, int lwe_size) {
int output_offset = blockIdx.x * lwe_size;
int input_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
}
}
// This function takes the aligned array after communication and places it back
// at the corresponding indexes. Each block handles one LWE.
template <typename Torus>
__global__ void realign_with_indexes(Torus *d_vector,
Torus const *d_packed_vector,
Torus const *d_indexes, int lwe_size) {
int input_offset = blockIdx.x * lwe_size;
int output_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
}
}
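The two kernels above only rearrange data by index; a CPU-side sketch of the same packing and unpacking arithmetic (hypothetical Rust helpers, not part of this patch) may make the index math easier to follow. Packing first lets the later device-to-device transfer move inputs_on_gpu * lwe_size elements in one contiguous copy instead of one copy per LWE.

// Hypothetical CPU analogue of the align/realign kernels above, for illustration only.
fn cpu_align_with_indexes(packed: &mut [u64], vector: &[u64], indexes: &[usize], lwe_size: usize) {
    for (block, &idx) in indexes.iter().enumerate() {
        // block-th packed slot <- LWE selected by indexes[block]
        packed[block * lwe_size..(block + 1) * lwe_size]
            .copy_from_slice(&vector[idx * lwe_size..(idx + 1) * lwe_size]);
    }
}

fn cpu_realign_with_indexes(vector: &mut [u64], packed: &[u64], indexes: &[usize], lwe_size: usize) {
    for (block, &idx) in indexes.iter().enumerate() {
        // LWE selected by indexes[block] <- block-th packed slot
        vector[idx * lwe_size..(idx + 1) * lwe_size]
            .copy_from_slice(&packed[block * lwe_size..(block + 1) * lwe_size]);
    }
}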
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -102,15 +144,15 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus const *src, Torus const *h_src_indexes,
Torus const *src, Torus const *d_src_indexes,
bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
if (max_active_gpu_count < gpu_count)
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -127,18 +169,28 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);
} else {
if (h_src_indexes == nullptr)
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
auto src_indexes = h_src_indexes + gpu_offset;
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest[i] + j * lwe_size;
auto d_src = src + src_indexes[j] * lwe_size;
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
}
}
}
@@ -150,7 +202,8 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index,
Torus *d_dest_indexes, bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
@@ -168,19 +221,27 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
if (h_dest_indexes == nullptr)
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
auto dest_indexes = h_dest_indexes + gpu_offset;
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest + dest_indexes[j] * lwe_size;
auto d_src = src[i] + j * lwe_size;
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
realign_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);
check_cuda_error(cudaGetLastError());
}
}
}

View File

@@ -289,7 +289,6 @@ unsafe extern "C" {
message_modulus: u32,
carry_modulus: u32,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {

View File

@@ -88,6 +88,8 @@ extern "C" {
pub fn cuda_get_number_of_gpus() -> i32;
pub fn cuda_get_number_of_sms() -> i32;
pub fn cuda_synchronize_device(gpu_index: u32);
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);

View File

@@ -15,7 +15,7 @@
[rtl]
bpip_use = true
bpip_use_opportunism = true
bpip_use_opportunism = false
bpip_timeout = 100_000
[board]
@@ -35,13 +35,21 @@
bsk_pc = [
{Hbm={pc=8}},
{Hbm={pc=10}},
{Hbm={pc=12}},
{Hbm={pc=14}},
{Hbm={pc=24}},
{Hbm={pc=26}},
{Hbm={pc=28}},
{Hbm={pc=30}},
{Hbm={pc=40}},
{Hbm={pc=42}},
{Hbm={pc=44}},
{Hbm={pc=46}},
{Hbm={pc=56}},
{Hbm={pc=60}}
{Hbm={pc=58}},
{Hbm={pc=60}},
{Hbm={pc=62}}
]
ksk_pc = [
@@ -70,7 +78,7 @@
#implementation = "Ilp"
implementation = "Llt"
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
min_batch_size = 11
min_batch_size = 9
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
@@ -87,8 +95,8 @@
custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
[firmware.op_cfg.default]
fill_batch_fifo = true
min_batch_size = false
fill_batch_fifo = false
min_batch_size = true
use_tiers = false
flush_behaviour = "Patient"
flush = true

View File

@@ -49,3 +49,15 @@ offset= 0x10
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
[section.hpu_reset]
description="Used to control the HPU soft reset"
offset= 0x100
[section.hpu_reset.register.trigger]
description="A soft reset for the whole HPU reconfigurable logic"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"}
field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d1afb554756df4d8b39bee33ded2dda19c23a6f9d8e2b242092efd35cf1cc19
size 83281321
oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
size 84230351

View File

@@ -31,14 +31,22 @@ class LD(BaseInstruction):
self.__dict__ = d
def args(self):
return f'R{self.rid} @{hex(self.slot["Addr"])}'
try:
return f'R{self.rid} @{hex(self.slot["Addr"])}'
except:
# It can happen that an IOP is not translated by the FW
return f'R{self.rid} @{self.slot}'
class ST(BaseInstruction):
def __init__(self, d):
self.__dict__ = d
def args(self):
return f'@{hex(self.slot["Addr"])} R{self.rid}'
try:
return f'@{hex(self.slot["Addr"])} R{self.rid}'
except:
# It can happen that an IOP is not translated by the FW
return f'@{self.slot} R{self.rid}'
class MAC(BaseInstruction):
def __init__(self, d):

View File

@@ -176,6 +176,18 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
imm: 1,
};
pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to make this dynamic
pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
dst: [VarMode::Native; SIMD_N],
src: [VarMode::Native; 2 * SIMD_N],
imm: 0,
};
pub const IOP_2NCT_F_3NCT: ConstIOpProto<{ 2 * SIMD_N }, { 3 * SIMD_N }> = ConstIOpProto {
dst: [VarMode::Native; 2 * SIMD_N],
src: [VarMode::Native; 3 * SIMD_N],
imm: 0,
};
use crate::iop;
use arg::IOpFormat;
use lazy_static::lazy_static;
@@ -227,4 +239,6 @@ iop!(
[IOP_CT_F_CT -> "LEAD1", opcode::LEAD1],
[IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
[IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
[IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
[IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
);

View File

@@ -87,6 +87,10 @@ pub const LEAD1: u8 = 0x85;
pub const TRAIL0: u8 = 0x86;
pub const TRAIL1: u8 = 0x87;
// SIMD for maximum throughput
pub const ADD_SIMD: u8 = 0xF0;
pub const ERC_20_SIMD: u8 = 0xF1;
//
// Utility operations
// Used to handle real clone of ciphertext already uploaded in the Hpu memory
pub const MEMCPY: u8 = 0xFF;

View File

@@ -72,6 +72,9 @@ crate::impl_fw!("Ilp" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
// SIMD Implementations
ADD_SIMD => fw_impl::llt::iop_add_simd;
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
#[instrument(level = "trace", skip(prog))]

View File

@@ -57,16 +57,16 @@ crate::impl_fw!("Llt" [
OVF_SSUB => fw_impl::ilp::iop_overflow_ssub;
OVF_MULS => fw_impl::ilp::iop_overflow_muls;
BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
@@ -81,6 +81,10 @@ crate::impl_fw!("Llt" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
// SIMD Implementations
ADD_SIMD => fw_impl::llt::iop_add_simd;
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
// ----------------------------------------------------------------------------
@@ -102,6 +106,17 @@ pub fn iop_add(prog: &mut Program) {
iop_addx(prog, dst, src_a, src_b);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_add_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ADD_SIMD Operand::Dst Operand::Src Operand::Src".to_string());
simd(
prog,
crate::asm::iop::SIMD_N,
fw_impl::llt::iop_add_ripple_rtl,
);
}
pub fn iop_adds(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
@@ -189,7 +204,7 @@ pub fn iop_mul(prog: &mut Program) {
// Add Comment header
prog.push_comment("MUL Operand::Dst Operand::Src Operand::Src".to_string());
// Deferred implementation to generic mulx function
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
@@ -205,29 +220,50 @@ pub fn iop_muls(prog: &mut Program) {
// Add Comment header
prog.push_comment("MULS Operand::Dst Operand::Src Operand::Immediat".to_string());
// Deferred implementation to generic mulx function
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
iop_erc_20_rtl(prog, 0).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
}
// ----------------------------------------------------------------------------
// Helper Functions
// ----------------------------------------------------------------------------
/// Implement erc_20 fund xfer
/// The targeted algorithm is as follows:
/// 1. Check that from has enough funds
/// 2. Compute real_amount to xfer (i.e. amount or 0)
/// 3. Compute the new amounts (from - real_amount, to + real_amount)
///
/// The input operands are:
/// (from[0], to[0], amount[0], ..., from[N-1], to[N-1], amount[N-1])
/// The output operands are:
/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20(prog: &mut Program) {
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
// Allocate metavariables:
// Dest -> Operand
let dst_from = prog.iop_template_var(OperandKind::Dst, 0);
let dst_to = prog.iop_template_var(OperandKind::Dst, 1);
let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
let dst_to = prog.iop_template_var(OperandKind::Dst, 2 * batch_index + 1);
// Src -> Operand
let src_from = prog.iop_template_var(OperandKind::Src, 0);
let src_to = prog.iop_template_var(OperandKind::Src, 1);
let src_from = prog.iop_template_var(OperandKind::Src, 3 * batch_index);
let src_to = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 1);
// Src Amount -> Operand
let src_amount = prog.iop_template_var(OperandKind::Src, 2);
// Add Comment header
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
// TODO: Make this a parameter or sweep this
// All these little parameters would be very handy to write an
@@ -236,7 +272,7 @@ pub fn iop_erc_20(prog: &mut Program) {
let kogge_blk_w = 10;
let ripple = true;
let tree = {
{
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
let lut = pbs_by_name!("IfFalseZeroed");
@@ -273,13 +309,26 @@ pub fn iop_erc_20(prog: &mut Program) {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
}
};
tree.add_to_prog(prog);
}
}
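For reference, the batched operand layout described in the doc comment above boils down to a fixed index mapping per batch element; a small standalone sketch (hypothetical helper, not firmware code) spells it out:

// Hypothetical illustration of the SIMD ERC-20 operand indexing: batch element i
// writes destinations (2*i, 2*i + 1) and reads sources (3*i, 3*i + 1, 3*i + 2).
fn erc20_simd_operand_indexes(i: usize) -> ([usize; 2], [usize; 3]) {
    ([2 * i, 2 * i + 1], [3 * i, 3 * i + 1, 3 * i + 2])
}

fn main() {
    // Batch element 2 writes (dst_from, dst_to) = operands 4, 5
    // and reads (from, to, amount) = operands 6, 7, 8.
    assert_eq!(erc20_simd_operand_indexes(2), ([4, 5], [6, 7, 8]));
}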
/// A SIMD implementation of add for maximum throughput
#[instrument(level = "trace", skip(prog))]
pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
// Allocate metavariables:
let dst = prog.iop_template_var(OperandKind::Dst, i);
let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
let src_b = prog.iop_template_var(OperandKind::Src, 2 * i + 1);
// Convert MetaVarCell in VarCell for Rtl analysis
let a = VarCell::from_vec(src_a);
let b = VarCell::from_vec(src_b);
let d = VarCell::from_vec(dst);
// Do a + b with the ripple carry adder
kogge::ripple_add(d, a, b, None)
}
// ----------------------------------------------------------------------------
// Helper Functions
// ----------------------------------------------------------------------------
fn iop_addx(
prog: &mut Program,
dst: Vec<MetaVarCell>,
@@ -313,11 +362,181 @@ fn iop_subx(
.add_to_prog(prog);
}
/// Generic mul operation for massively parallel HPUs
#[instrument(level = "trace", skip(prog))]
pub fn iop_mulx_par(
prog: &mut Program,
dst: Vec<metavar::MetaVarCell>,
src_a: Vec<metavar::MetaVarCell>,
src_b: Vec<metavar::MetaVarCell>,
) -> Rtl {
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
let blk_w = props.blk_w();
// Transform metavars into RTL vars
let mut dst = VarCell::from_vec(dst);
let src_a = VarCell::from_vec(src_a);
let src_b = VarCell::from_vec(src_b);
let max_deg = VarDeg {
deg: props.max_val(),
nu: props.nu,
};
let pbs_mul_lsb = pbs_by_name!("MultCarryMsgLsb");
let pbs_mul_msb = pbs_by_name!("MultCarryMsgMsb");
let max_carry = (props.max_msg() * props.max_msg()) >> props.msg_w;
let max_msg = props.max_msg();
let mut mul_map: HashMap<usize, Vec<VarCellDeg>> = HashMap::new();
itertools::iproduct!(0..blk_w, 0..blk_w).for_each(|(i, j)| {
let pp = src_a[i].mac(tfhe_params.msg_range(), &src_b[j]);
let lsb = pp.single_pbs(&pbs_mul_lsb);
let msb = pp.single_pbs(&pbs_mul_msb);
mul_map
.entry(i + j)
.or_default()
.push(VarCellDeg::new(max_msg, lsb));
mul_map
.entry(i + j + 1)
.or_default()
.push(VarCellDeg::new(max_carry, msb));
});
let mut pp: Vec<VecVarCellDeg> = (0..dst.len())
.map(|i| mul_map.remove(&i).unwrap().into())
.collect();
// Reduce the partial products, Dadda-tree style
while pp.iter().any(|x| x.len() > 1) {
trace!(
target: "llt::mul",
"pp length: {:?}",
pp.iter().map(|x| x.len()).collect::<Vec<_>>()
);
for c in (0..dst.len()).rev() {
let mut col_len = pp[c].len();
let mut reduced = Vec::new();
let mut chunks = pp[c].deg_chunks(&max_deg).peekable();
let max_col = if c == (dst.len() - 1) {
0
} else {
dst.len() - 1
};
while chunks.peek().is_some() && col_len > pp[max_col].len() {
let mut chunk = chunks.next().unwrap();
let chunk_len = chunk.len();
col_len -= chunk.len();
// sum the chunk
while chunk.len() > 1 {
chunk = chunk
.chunks(2)
.map(|chunk| match chunk.len() {
1 => chunk[0].clone(),
2 => &chunk[0] + &chunk[1],
_ => panic!("Invalid chunk size"),
})
.collect()
}
// And bootstrap if needed
let element = chunk
.into_iter()
.next()
.map(|sum| {
assert!(sum.deg.nu <= props.nu);
if sum.deg == max_deg || chunk_len == 1 {
let (data, carry) = sum.bootstrap(&props);
if let (Some(carry), Some(elm)) = (carry, pp.get_mut(c + 1)) {
elm.push(carry);
}
data
} else {
sum
}
})
.unwrap();
reduced.push(element);
}
pp[c] = reduced
.into_iter()
.chain(chunks.flatten())
.collect::<Vec<_>>()
.into();
}
}
trace!(
target: "llt::mul",
"final pp: {:?}", pp
);
// Extract carry and message and do carry propagation
let mut a: Vec<Option<VarCell>> = (0..dst.len() + 1).map(|_| None).collect();
let mut b: Vec<Option<VarCell>> = (0..dst.len() + 1).map(|_| None).collect();
pp.into_iter().enumerate().for_each(|(i, pp)| {
assert!(pp.len() == 1);
let vardeg = pp.first().unwrap();
let (msg, carry) = vardeg.bootstrap(&props);
a[i] = Some(msg.var);
if let Some(carry) = carry {
b[i + 1] = Some(carry.var);
}
});
let cs: Vec<_> = a
.into_iter()
.take(dst.len())
.zip(b.into_iter())
.map(|(a, b)| match (a, b) {
(Some(a), Some(b)) => &a + &b,
(Some(a), None) => a,
(None, Some(b)) => b,
_ => panic!("Fix your code"),
})
.collect();
// Do fully parallel carry propagation
kogge::propagate_carry(prog, dst.as_mut_slice(), cs.as_slice(), &None);
Rtl::from(dst)
}
/// multiplier wrapper, to choose between parallel and serial implementations
#[instrument(level = "trace", skip(prog))]
pub fn iop_mulx(
prog: &mut Program,
dst: Vec<metavar::MetaVarCell>,
src_a: Vec<metavar::MetaVarCell>,
src_b: Vec<metavar::MetaVarCell>,
) -> Rtl {
// When the batch size is enough to do a full stage in parallel, do parallel
// mul.
// Note: The break-even point might not be exactly this one, but choosing the
// right point is unimportant since the number of batches will grow immensely
// from FPGA to ASIC.
let parallel = prog
.op_cfg()
.parallel
.unwrap_or_else(|| prog.params().pbs_batch_w >= dst.len());
if parallel {
iop_mulx_par(prog, dst, src_a, src_b)
} else {
iop_mulx_ser(prog, dst, src_a, src_b)
}
}
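To make the dispatch above concrete, here is a standalone restatement of the condition with assumed block counts for a 2_2 parameter set (the numbers are illustrative, not taken from the patch):

// Hypothetical restatement of the parallel/serial multiplier dispatch above.
fn choose_parallel(cfg_override: Option<bool>, pbs_batch_w: usize, num_dst_blocks: usize) -> bool {
    cfg_override.unwrap_or(pbs_batch_w >= num_dst_blocks)
}

fn main() {
    // Assuming a PBS batch width of 16:
    assert!(choose_parallel(None, 16, 4)); // small radix (e.g. 8 bits, 4 blocks): parallel
    assert!(!choose_parallel(None, 16, 32)); // 64-bit radix (32 blocks): serial
    assert!(choose_parallel(Some(true), 16, 32)); // explicit op_cfg override wins
}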
/// Generic mul operation
/// One destination and two sources operation
/// Source could be Operand or Immediat
#[instrument(level = "trace", skip(prog))]
pub fn iop_mulx(
pub fn iop_mulx_ser(
prog: &mut Program,
dst: Vec<metavar::MetaVarCell>,
src_a: Vec<metavar::MetaVarCell>,
@@ -367,7 +586,10 @@ pub fn iop_mulx(
sum.var.single_pbs(&pbs_carry),
));
}
VarCellDeg::new(props.max_msg(), sum.var.single_pbs(&pbs_msg))
VarCellDeg::new(
sum.deg.deg.min(props.max_msg()),
sum.var.single_pbs(&pbs_msg),
)
};
while to_sum.len() > 1 {
@@ -540,3 +762,23 @@ fn bw_inv(prog: &mut Program, b: Vec<VarCell>) -> Vec<VarCell> {
})
.collect::<Vec<_>>()
}
/// Creates a SIMD version of the closure.
/// Make sure that the closure is a PBS-optimized version of the operation.
/// The closure receives the program and the batch index as inputs.
/// How the ASM operands are actually organized is defined by the closure
/// itself.
///
/// Maybe this should go into a SIMD firmware implementation... At some point we
/// would need a mechanism to choose between implementations on the fly to make
/// really good use of all of this.
fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F)
where
F: Fn(&mut Program, u8) -> Rtl,
{
(0..batch_size)
.map(|i| i as u8)
.map(|i| rtl_closure(prog, i))
.sum::<Rtl>()
.add_to_prog(prog);
}

View File

@@ -1,4 +1,6 @@
use super::rtl::VarCell;
use super::*;
use crate::pbs_by_name;
use tracing::trace;
#[derive(Clone, Eq, Default, Debug)]
@@ -48,6 +50,48 @@ pub struct VarCellDeg {
pub deg: VarDeg,
}
impl VarCellDeg {
pub fn bootstrap(&self, props: &FwParameters) -> (VarCellDeg, Option<VarCellDeg>) {
trace!(target: "vardeg::VarCellDeg::bootstrap", "bootstrap: {:?}", self);
let pbs_many_carry = pbs_by_name!("ManyCarryMsg");
let pbs_carry = pbs_by_name!("CarryInMsg");
let pbs_msg = pbs_by_name!("MsgOnly");
if self.deg.deg <= props.max_msg() {
match self.deg.nu {
1 => (self.clone(), None),
_ => (
VarCellDeg::new(self.deg.deg, self.var.single_pbs(&pbs_msg)),
None,
),
}
// If we still have a bit available to do manyLUT
} else if self.deg.deg > props.max_msg() && self.deg.deg <= (props.max_val() >> 1) {
let mut pbs = self.var.pbs(&pbs_many_carry).into_iter();
(
VarCellDeg::new(props.max_msg().min(self.deg.deg), pbs.next().unwrap()),
Some(VarCellDeg::new(
self.deg.deg >> props.carry_w,
pbs.next().unwrap(),
)),
)
//Otherwise, we'll have to use two independent PBSs
} else {
(
VarCellDeg::new(
self.deg.deg.min(props.max_msg()),
self.var.single_pbs(&pbs_msg),
),
Some(VarCellDeg::new(
self.deg.deg >> props.carry_w,
self.var.single_pbs(&pbs_carry),
)),
)
}
}
}
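The three branches of bootstrap() pick the cheapest PBS plan from the tracked degree; the cutoffs for an assumed 2-bit message / 2-bit carry parameter set (max_msg = 3, max_val = 15, values assumed for illustration) work out as in this standalone sketch:

// Hypothetical mirror of the degree-based PBS selection above.
#[derive(Debug, PartialEq)]
enum BootstrapKind {
    MsgOnly, // deg <= max_msg: at most one message-only PBS (none if nu == 1)
    ManyLut, // max_msg < deg <= max_val / 2: one many-LUT PBS yields both msg and carry
    TwoPbs,  // otherwise: two independent PBSs, one for msg and one for carry
}

fn classify(deg: u64, max_msg: u64, max_val: u64) -> BootstrapKind {
    if deg <= max_msg {
        BootstrapKind::MsgOnly
    } else if deg <= max_val >> 1 {
        BootstrapKind::ManyLut
    } else {
        BootstrapKind::TwoPbs
    }
}

fn main() {
    let (max_msg, max_val) = (3, 15); // assumed 2_2 parameters
    assert_eq!(classify(3, max_msg, max_val), BootstrapKind::MsgOnly);
    assert_eq!(classify(5, max_msg, max_val), BootstrapKind::ManyLut);
    assert_eq!(classify(9, max_msg, max_val), BootstrapKind::TwoPbs);
}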
impl PartialOrd for VarCellDeg {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
@@ -105,25 +149,23 @@ impl std::fmt::Debug for VarCellDeg {
}
impl VecVarCellDeg {
pub fn deg_chunks(
mut self,
max_deg: &VarDeg,
) -> <Vec<Vec<VarCellDeg>> as IntoIterator>::IntoIter {
pub fn deg_chunks(&self, max_deg: &VarDeg) -> <Vec<Vec<VarCellDeg>> as IntoIterator>::IntoIter {
trace!(target: "llt:deg_chunks", "len: {:?}, {:?}", self.len(), self.0);
let mut res: Vec<Vec<VarCellDeg>> = Vec::new();
let mut acc: VarDeg = VarDeg::default();
let mut chunk: Vec<VarCellDeg> = Vec::new();
let mut copy = self.0.clone();
// There are many ways to combine the whole vector in chunks up to
// max_deg. We'll be greedy and sum up the elements by maximum degree
// first.
self.0.sort();
copy.sort();
while !self.is_empty() {
let sum = &acc + &self.0.last().unwrap().deg;
while !copy.is_empty() {
let sum = &acc + &copy.last().unwrap().deg;
if sum <= *max_deg {
chunk.push(self.0.pop().unwrap());
chunk.push(copy.pop().unwrap());
acc = sum;
} else {
res.push(chunk);
@@ -131,7 +173,7 @@ impl VecVarCellDeg {
chunk = Vec::new();
}
trace!(target: "llt:deg_chunks:loop", "len: {:?}, {:?}, chunk: {:?}, acc: {:?}",
self.len(), self.0, chunk, acc);
self.len(), copy, chunk, acc);
}
// Any remaining chunk is appended
@@ -159,4 +201,8 @@ impl VecVarCellDeg {
pub fn is_empty(&self) -> bool {
self.0.len() == 0
}
pub fn push(&mut self, item: VarCellDeg) {
self.0.push(item)
}
}

View File

@@ -12,6 +12,8 @@ pub struct OpCfg {
pub flush: bool,
/// Whether to use latency tiers when scheduling
pub use_tiers: bool,
/// Whether to use a massively parallel implementation
pub parallel: Option<bool>,
}
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]

View File

@@ -1500,7 +1500,7 @@ impl Arch {
}
}
#[derive(Clone, Debug)]
#[derive(Default, Clone, Debug)]
pub struct Rtl(Vec<VarCell>);
impl Rtl {
@@ -1623,6 +1623,12 @@ impl std::ops::Add<Rtl> for Rtl {
}
}
impl std::iter::Sum<Rtl> for Rtl {
fn sum<I: Iterator<Item = Rtl>>(iter: I) -> Self {
iter.fold(Rtl::default(), |acc, x| acc + x)
}
}
impl Drop for Rtl {
fn drop(&mut self) {
self.unload();

View File

@@ -354,7 +354,7 @@ impl InfoPePbs {
}
pub fn update_load_bsk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
(1..16).for_each(|i| {
(0..16).for_each(|i| {
let reg_name = format!("runtime_3in3::pep_load_bsk_rcp_dur_pc{i}");
let reg = regmap
.register()
@@ -364,7 +364,7 @@ impl InfoPePbs {
});
}
pub fn update_load_ksk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
(1..16).for_each(|i| {
(0..16).for_each(|i| {
let reg_name = format!("runtime_1in3::pep_load_ksk_rcp_dur_pc{i}");
let reg = regmap
.register()

View File

@@ -64,13 +64,13 @@ pub struct IscPoolState {
pub(super) vld: bool,
pub(super) wr_lock: u32,
pub(super) rd_lock: u32,
//pub(super) issue_lock: u32,
pub(super) issue_lock: u32,
pub(super) sync_id: u32,
}
impl Len for IscPoolState {
fn len() -> usize {
21
28
}
}
@@ -85,8 +85,8 @@ where
vld: *(slice.get(2).ok_or(NoMoreBits)?),
wr_lock: slice.get(3..10).ok_or(NoMoreBits)?.load::<u32>(),
rd_lock: slice.get(10..17).ok_or(NoMoreBits)?.load::<u32>(),
//issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::<u32>(),
sync_id: slice.get(17..21).ok_or(NoMoreBits)?.load::<u32>(),
issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::<u32>(),
sync_id: slice.get(24..28).ok_or(NoMoreBits)?.load::<u32>(),
})
}
}
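The len() change from 21 to 28 follows directly from re-enabling the issue_lock field: the decoded slices are vld (bit 2), wr_lock (3..10), rd_lock (10..17), issue_lock (17..24) and sync_id (24..28), so the last field now ends at bit 28. A standalone consistency check, not part of the source:

// Hypothetical check that the bit ranges decoded above are contiguous and end at 28.
fn main() {
    let ranges = [(2, 3), (3, 10), (10, 17), (17, 24), (24, 28)]; // vld .. sync_id
    assert!(ranges.windows(2).all(|w| w[0].1 == w[1].0));
    assert_eq!(ranges.last().unwrap().1, 28); // matches the new Len::len()
}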

View File

@@ -79,6 +79,9 @@ pub enum Command {
#[arg(short, long, default_value_t = String::from("trace.json"))]
file: String,
},
#[clap(about = "Resets all HPU processing logic")]
SoftReset {},
}
#[derive(Clone, Debug, ValueEnum)]
@@ -286,5 +289,26 @@ fn main() {
serde_json::to_writer_pretty(file.make_writer(), &parsed)
.expect("Could not write trace dump");
}
Command::SoftReset {} => {
let soft_reset = regmap
.register()
.get("hpu_reset::trigger")
.expect("The current HPU does not support soft reset.");
let soft_reset_addr = *soft_reset.offset() as u64;
for reset in [true, false].into_iter() {
hpu_hw.write_reg(soft_reset_addr, reset as u32);
loop {
let done = {
let val = hpu_hw.read_reg(soft_reset_addr);
let fields = soft_reset.as_field(val);
*fields.get("done").expect("Unknown field") != 0
};
if done == reset {
break;
}
}
}
}
}
}

View File

@@ -12,5 +12,9 @@
"n3-H100x4": 6.08,
"n3-H100x2": 3.04,
"n3-L40x1": 0.80,
"n3-H100x8-SXM5": 19.2
"n3-H100x8-SXM5": 19.2,
"hpu_x1": 1.0,
"hpu_x2": 1.4,
"hpu_x4": 2.3,
"hpu_x8": 4.0
}

View File

@@ -36,7 +36,7 @@
[pc_params]
ksk_pc= 16
ksk_bytes_w= 32
bsk_pc= 8
bsk_pc= 16
bsk_bytes_w= 32
pem_pc= 2
pem_bytes_w= 32

View File

@@ -23,7 +23,7 @@ V80_PCIE_DEV="unselected"
XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
# V80 bitstream refresh require insmod of ami.ko module
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/1e6a8da"}
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/ef9249f"}
# Parse user CLI ##############################################################
opt_short="hc:l:p:"
@@ -63,6 +63,9 @@ do
if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
V80_PCIE_DEV="${2}"
((i++))
shift 1
elif [[ ${#DEVICE[@]} -eq 1 ]]; then
V80_PCIE_DEV=${DEVICE[0]%%:*}
else
echo "Please select a device in following list (1st two digits):"
for item in "${DEVICE[@]}"; do
@@ -70,7 +73,7 @@ do
done
return 1
fi
shift 2
shift 1
;;
"") # End of input reading
break ;;

View File

@@ -29,9 +29,15 @@ fn bench_fhe_type<FheType>(
+ RotateRight<&'a FheType, Output = FheType>
+ OverflowingAdd<&'a FheType, Output = FheType>
+ OverflowingSub<&'a FheType, Output = FheType>,
for<'a> FheType: FheMin<&'a FheType, Output = FheType> + FheMax<&'a FheType, Output = FheType>,
{
let mut bench_group = c.benchmark_group(type_name);
let bench_prefix = "hlapi::ops";
let mut bench_prefix = "hlapi::ops".to_string();
if cfg!(feature = "gpu") {
bench_prefix = format!("{}::cuda", bench_prefix);
} else if cfg!(feature = "hpu") {
bench_prefix = format!("{}::hpu", bench_prefix);
}
let mut rng = thread_rng();
@@ -66,31 +72,25 @@ fn bench_fhe_type<FheType>(
});
write_record(bench_id, "add");
#[cfg(not(feature = "hpu"))]
{
bench_id = format!("{bench_prefix}::overflowing_add::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_add(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_add");
}
bench_id = format!("{bench_prefix}::overflowing_add::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_add(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_add");
#[cfg(not(feature = "hpu"))]
{
bench_id = format!("{bench_prefix}::overflowing_sub::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_sub(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_sub");
}
bench_id = format!("{bench_prefix}::overflowing_sub::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_sub(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_sub");
bench_id = format!("{bench_prefix}::sub::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
@@ -142,48 +142,65 @@ fn bench_fhe_type<FheType>(
});
write_record(bench_id, "bitxor");
#[cfg(not(feature = "hpu"))]
{
bench_id = format!("{bench_prefix}::left_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs << &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_shift");
bench_id = format!("{bench_prefix}::left_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs << &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_shift");
bench_id = format!("{bench_prefix}::right_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs >> &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_shift");
bench_id = format!("{bench_prefix}::right_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs >> &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_shift");
bench_id = format!("{bench_prefix}::left_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_left(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_rotate");
bench_id = format!("{bench_prefix}::left_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_left(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_rotate");
bench_id = format!("{bench_prefix}::right_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_right(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_rotate");
}
bench_id = format!("{bench_prefix}::right_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_right(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_rotate");
bench_id = format!("{bench_prefix}::min::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = lhs.min(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "min");
bench_id = format!("{bench_prefix}::max::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = lhs.max(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "max");
}
macro_rules! bench_type {

View File

@@ -1,9 +1,16 @@
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::params_aliases::{
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
#[cfg(not(feature = "gpu"))]
use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::params_aliases::{
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{
@@ -21,11 +28,13 @@ use tfhe::core_crypto::gpu::get_number_of_gpus;
#[cfg(feature = "gpu")]
use tfhe::{set_server_key, GpuIndex};
use tfhe::{
ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
ClientKey, CompressedCiphertextListBuilder, CompressedServerKey,
CompressedSquashedNoiseCiphertextListBuilder, FheUint10, FheUint12, FheUint128, FheUint14,
FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8, HlCompressible,
HlExpandable, HlSquashedNoiseCompressible,
};
fn bench_fhe_type<FheType>(
fn bench_sns_only_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
@@ -139,27 +148,184 @@ fn bench_fhe_type<FheType>(
);
}
macro_rules! bench_type {
fn bench_decomp_sns_comp_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
num_bits: usize,
) where
FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
FheType: SquashNoise + Tagged + HlExpandable + HlCompressible,
<FheType as SquashNoise>::Output: HlSquashedNoiseCompressible,
{
let mut bench_group = c.benchmark_group(type_name);
let bench_id_prefix = if cfg!(feature = "gpu") {
"hlapi::cuda"
} else {
"hlapi"
};
let bench_id_suffix = format!("decomp_noise_squash_comp::{type_name}");
let mut rng = thread_rng();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let input = FheType::encrypt(rng.gen(), client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
let compressed = builder.build().unwrap();
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let decompressed = compressed.get::<FheType>(0).unwrap().unwrap();
let squashed = decompressed.squash_noise().unwrap();
let mut builder = CompressedSquashedNoiseCiphertextListBuilder::new();
builder.push(squashed);
let _ = builder.build();
})
});
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
let params = client_key.computation_parameters();
let num_blocks = num_bits
.div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
#[cfg(feature = "gpu")]
{
let elements = throughput_num_threads(num_blocks, 4);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
let compressed_server_key = CompressedServerKey::new(client_key);
let sks_vec = (0..gpu_count)
.map(|i| {
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
})
.collect::<Vec<_>>();
bench_group.bench_function(&bench_id, |b| {
let compressed_values = || {
(0..elements)
.map(|_| {
let input = FheType::encrypt(rng.gen(), client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
builder.build().unwrap()
})
.collect::<Vec<_>>()
};
b.iter_batched(
compressed_values,
|compressed_inputs| {
compressed_inputs
.par_iter()
.enumerate()
.for_each(|(i, input)| {
set_server_key(sks_vec[i % gpu_count].clone());
let decompressed = input.get::<FheType>(0).unwrap().unwrap();
let squashed = decompressed.squash_noise().unwrap();
let mut builder =
CompressedSquashedNoiseCiphertextListBuilder::new();
builder.push(squashed);
let _ = builder.build();
})
},
criterion::BatchSize::SmallInput,
)
});
}
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let compressed_values = || {
(0..elements)
.map(|_| {
let input = FheType::encrypt(rng.gen(), client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
builder.build().unwrap()
})
.collect::<Vec<_>>()
};
b.iter_batched(
compressed_values,
|compressed_inputs| {
compressed_inputs.par_iter().for_each(|input| {
let decompressed = input.get::<FheType>(0).unwrap().unwrap();
let squashed = decompressed.squash_noise().unwrap();
let mut builder =
CompressedSquashedNoiseCiphertextListBuilder::new();
builder.push(squashed);
let _ = builder.build();
})
},
criterion::BatchSize::SmallInput,
)
});
}
}
}
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"decomp_noise_squash_comp",
&OperatorType::Atomic,
64,
vec![],
);
}
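Stripped of the Criterion harness and throughput plumbing, the pipeline this benchmark measures is decompress, squash noise, then compress the squashed result; a minimal sketch reusing only the HLAPI calls visible above (imports as already present in this file, key setup elided, a server key assumed to be set):

// Minimal sketch of the measured pipeline, not part of the benchmark itself.
fn decomp_squash_comp(client_key: &ClientKey) {
    let input = FheUint64::encrypt(42u128, client_key);
    let mut builder = CompressedCiphertextListBuilder::new();
    builder.push(input);
    let compressed = builder.build().unwrap();
    // The portion that the benchmark times:
    let decompressed = compressed.get::<FheUint64>(0).unwrap().unwrap();
    let squashed = decompressed.squash_noise().unwrap();
    let mut sns_builder = CompressedSquashedNoiseCiphertextListBuilder::new();
    sns_builder.push(squashed);
    let _ = sns_builder.build();
}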
macro_rules! bench_sns_only_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_sns_only_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
}
}
};
}
bench_type!(FheUint2);
bench_type!(FheUint4);
bench_type!(FheUint6);
bench_type!(FheUint8);
bench_type!(FheUint10);
bench_type!(FheUint12);
bench_type!(FheUint14);
bench_type!(FheUint16);
bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
macro_rules! bench_decomp_sns_comp_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
}
}
};
}
bench_sns_only_type!(FheUint2);
bench_sns_only_type!(FheUint4);
bench_sns_only_type!(FheUint6);
bench_sns_only_type!(FheUint8);
bench_sns_only_type!(FheUint10);
bench_sns_only_type!(FheUint12);
bench_sns_only_type!(FheUint14);
bench_sns_only_type!(FheUint16);
bench_sns_only_type!(FheUint32);
bench_sns_only_type!(FheUint64);
bench_sns_only_type!(FheUint128);
bench_decomp_sns_comp_type!(FheUint64);
fn main() {
#[cfg(feature = "hpu")]
@@ -172,6 +338,10 @@ fn main() {
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.enable_noise_squashing_compression(
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_compression(BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
@@ -190,6 +360,12 @@ fn main() {
.enable_noise_squashing(
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing_compression(
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_compression(
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
@@ -200,17 +376,19 @@ fn main() {
let mut c = Criterion::default().configure_from_args();
bench_fhe_uint2(&mut c, &cks);
bench_fhe_uint4(&mut c, &cks);
bench_fhe_uint6(&mut c, &cks);
bench_fhe_uint8(&mut c, &cks);
bench_fhe_uint10(&mut c, &cks);
bench_fhe_uint12(&mut c, &cks);
bench_fhe_uint14(&mut c, &cks);
bench_fhe_uint16(&mut c, &cks);
bench_fhe_uint32(&mut c, &cks);
bench_fhe_uint64(&mut c, &cks);
bench_fhe_uint128(&mut c, &cks);
bench_sns_only_fhe_uint2(&mut c, &cks);
bench_sns_only_fhe_uint4(&mut c, &cks);
bench_sns_only_fhe_uint6(&mut c, &cks);
bench_sns_only_fhe_uint8(&mut c, &cks);
bench_sns_only_fhe_uint10(&mut c, &cks);
bench_sns_only_fhe_uint12(&mut c, &cks);
bench_sns_only_fhe_uint14(&mut c, &cks);
bench_sns_only_fhe_uint16(&mut c, &cks);
bench_sns_only_fhe_uint32(&mut c, &cks);
bench_sns_only_fhe_uint64(&mut c, &cks);
bench_sns_only_fhe_uint128(&mut c, &cks);
bench_decomp_sns_comp_fhe_uint64(&mut c, &cks);
c.final_summary();
}

View File

@@ -677,6 +677,114 @@ fn if_then_else_parallelized(c: &mut Criterion) {
bench_group.finish()
}
fn flip_parallelized(c: &mut Criterion) {
let bench_name = "integer::flip_parallelized";
let display_name = "flip";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let bench_data = LazyCell::new(|| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let clear_0 = gen_random_u256(&mut rng);
let clear_1 = gen_random_u256(&mut rng);
let clear_cond = rng.gen_bool(0.5);
let true_ct = cks.encrypt_radix(clear_0, num_block);
let false_ct = cks.encrypt_radix(clear_1, num_block);
let condition = cks.encrypt_bool(clear_cond);
(sks, condition, true_ct, false_ct)
});
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (sks, condition, true_ct, false_ct) =
(&bench_data.0, &bench_data.1, &bench_data.2, &bench_data.3);
b.iter(|| sks.flip_parallelized(condition, true_ct, false_ct))
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let clear_0 = gen_random_u256(&mut rng);
let true_ct = cks.encrypt_radix(clear_0, num_block);
let clear_1 = gen_random_u256(&mut rng);
let false_ct = cks.encrypt_radix(clear_1, num_block);
let condition = sks.create_trivial_boolean_block(rng.gen_bool(0.5));
reset_pbs_count();
sks.flip_parallelized(&condition, &true_ct, &false_ct);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let cts_cond = (0..elements)
.map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5)))
.collect::<Vec<_>>();
let cts_then = (0..elements)
.map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block))
.collect::<Vec<_>>();
let cts_else = (0..elements)
.map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block))
.collect::<Vec<_>>();
(cts_cond, cts_then, cts_else)
};
b.iter_batched(
setup_encrypted_values,
|(cts_cond, cts_then, cts_else)| {
cts_cond
.par_iter()
.zip(cts_then.par_iter())
.zip(cts_else.par_iter())
.for_each(|((condition, true_ct), false_ct)| {
sks.flip_parallelized(condition, true_ct, false_ct);
})
},
criterion::BatchSize::SmallInput,
);
});
}
}
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
fn ciphertexts_sum_parallelized(c: &mut Criterion) {
let bench_name = "integer::sum_ciphertexts_parallelized";
let display_name = "sum_ctxts";
@@ -3032,7 +3140,7 @@ mod hpu {
fn [< default_hpu_ $iop:lower >](c: &mut Criterion) {
bench_hpu_iop_clean_inputs(
c,
concat!("integer::hpu::scalar::", stringify!($iop)),
concat!("integer::hpu::scalar_", stringify!($iop)),
stringify!($name),
&hpu_asm::iop::[< IOP_ $iop:upper >],
)
@@ -3350,6 +3458,7 @@ criterion_group!(
gt_parallelized,
ge_parallelized,
if_then_else_parallelized,
flip_parallelized,
);
criterion_group!(
@@ -3365,6 +3474,7 @@ criterion_group!(
eq_parallelized,
gt_parallelized,
if_then_else_parallelized,
flip_parallelized
);
criterion_group!(
@@ -3629,6 +3739,9 @@ fn go_through_hpu_bench_groups(val: &str) {
hpu::default_hpu_bitwise();
hpu::default_hpu_cmp();
hpu::default_hpu_select();
hpu::default_hpu_shiftrot();
hpu::default_hpu_shiftrot_scalar();
hpu::default_hpu_bitcnt();
}
"fast_default" => {
hpu::default_hpu_ops();

File diff suppressed because it is too large

View File

@@ -145,7 +145,7 @@ pub mod shortint_params_aliases {
pub const BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingCompressionParameters =
V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
V1_4_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "hpu")]
// KS PBS Gaussian for Hpu

View File

@@ -3,7 +3,7 @@ use std::path::PathBuf;
use std::sync::OnceLock;
use std::{env, fs};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
use tfhe::core_crypto::gpu::{get_number_of_gpus, get_number_of_sms};
use tfhe::core_crypto::prelude::*;
#[cfg(feature = "boolean")]
@@ -417,10 +417,6 @@ pub fn get_bench_type() -> &'static BenchmarkType {
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap())
}
/// Number of streaming multiprocessors (SM) available on Nvidia H100 GPU
#[cfg(feature = "gpu")]
const H100_PCIE_SM_COUNT: u32 = 114;
/// Generate a number of threads to use to saturate current machine for throughput measurements.
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
@@ -431,11 +427,19 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
#[cfg(feature = "gpu")]
{
let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
let num_sms_per_gpu = get_number_of_sms();
let total_num_sm = num_sms_per_gpu * get_number_of_gpus();
let total_blocks_per_sm = 4u32; // Assume each SM can handle 4 blocks concurrently
let total_num_sm = total_blocks_per_sm * total_num_sm;
let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
let elements_per_wave = total_num_sm as u64 / (num_block as u64);
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
elements.min(200) // This threshold is useful for operation with both a small number of
// block and low PBs count.
elements.min(elements_per_wave * min_num_waves) // This threshold is useful for operations
// with both a small number of
// blocks and a low PBS count.
}
#[cfg(feature = "hpu")]
{

View File

@@ -14,6 +14,7 @@ rust-version = "1.72"
[dependencies]
aes = "0.8.2"
rayon = { workspace = true, optional = true }
getrandom = { workspace = true }
[target.'cfg(target_os = "macos")'.dependencies]
libc = "0.2.133"

View File

@@ -1,12 +1,11 @@
use crate::seeders::{Seed, Seeder};
use std::fs::File;
use std::io::Read;
/// A seeder which uses the `/dev/random` source on unix-like systems.
/// A seeder which uses the system entropy source on unix-like systems.
///
/// If available, this will use the `getrandom` or `getentropy` system call. Otherwise it will draw from
/// `/dev/urandom` after successfully polling `/dev/random`.
pub struct UnixSeeder {
counter: u128,
secret: u128,
file: File,
}
impl UnixSeeder {
@@ -15,35 +14,34 @@ impl UnixSeeder {
/// Important:
/// ----------
///
/// This secret is used to ensure the quality of the seed in scenarios where `/dev/random` may
/// be compromised.
/// This secret is used to ensure the quality of the seed in scenarios where the system random
/// source may be compromised.
///
/// The attack hypotheses are as follows:
/// - `/dev/random` output can be predicted by a process running on the machine by just
/// - The kernel random output can be predicted by a process running on the machine by just
/// observing various states of the machine
/// - The attacker cannot read data from the process where `tfhe-csprng` is running
///
/// Using a secret in `tfhe-csprng` allows to generate values that the attacker cannot
/// predict, making this seeder secure on systems were `/dev/random` outputs can be
/// predict, making this seeder secure on systems where the kernel random outputs can be
/// predicted.
pub fn new(secret: u128) -> UnixSeeder {
let file = std::fs::File::open("/dev/random").expect("Failed to open /dev/random .");
let counter = std::time::UNIX_EPOCH
.elapsed()
.expect("Failed to initialize unix seeder.")
.as_nanos();
UnixSeeder {
secret,
counter,
file,
}
UnixSeeder { secret }
}
}
impl Seeder for UnixSeeder {
/// Draws entropy from a system source to seed a CSPRNG.
///
/// It may block at system startup if the kernel entropy pool has not been initialized yet,
/// but should not block afterwards.
///
/// # Panics
/// This may panic if the `getrandom` system call is not available and no file descriptor is
/// available on the system.
fn seed(&mut self) -> Seed {
let output = self.secret ^ self.counter ^ dev_random(&mut self.file);
self.counter = self.counter.wrapping_add(1);
let output = self.secret ^ get_system_entropy();
Seed(output)
}
@@ -52,11 +50,14 @@ impl Seeder for UnixSeeder {
}
}
fn dev_random(random: &mut File) -> u128 {
fn get_system_entropy() -> u128 {
let mut buf = [0u8; 16];
random
.read_exact(&mut buf[..])
.expect("Failed to read from /dev/random .");
// This will use the getrandom syscall if possible (available since Linux 3.17). This syscall is
// not vulnerable to fd exhaustion since it directly pulls from kernel entropy sources.
//
// This syscall draws from the urandom entropy source but blocks at startup until it is correctly
// seeded. See <https://www.2uo.de/myths-about-urandom/> for a rationale about random vs urandom.
getrandom::getrandom(&mut buf).expect("Failed to read entropy from system");
u128::from_ne_bytes(buf)
}
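A minimal usage sketch of the reworked seeder (hedged: it assumes `UnixSeeder` and the `Seeder` trait are re-exported from `tfhe_csprng::seeders`; Unix-only, as before):

```rust
use tfhe_csprng::seeders::{Seeder, UnixSeeder};

fn main() {
    // Application-provided secret; it must stay unobservable to a local
    // attacker, since it is XORed into the kernel-provided entropy.
    let secret: u128 = 0x0123_4567_89ab_cdef_0123_4567_89ab_cdef;
    let mut seeder = UnixSeeder::new(secret);

    // Each call now fills 16 bytes through `getrandom` instead of reading
    // from an open `/dev/random` file descriptor.
    let _seed = seeder.seed();
}
```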

View File

@@ -37,7 +37,7 @@ serde_json = "1.0.96"
[target.'cfg(target_arch = "wasm32")'.dev-dependencies]
wasm-bindgen-test = "0.3"
wasm-bindgen = { workspace = true }
getrandom = { version = "0.2", features = ["js"] }
getrandom = { workspace = true, features = ["js"] }
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dev-dependencies]
rug = "1.19.1"

View File

@@ -216,24 +216,6 @@ fn stockham_core_1x2<c64xN: Pod>(
let oo = simd.mul(we, simd.add(a08m1a4c_pj_a2am1a6e, v8_a19m1a5d_pj_a3bm1a7f));
let pp = simd.mul(wf, simd.add(s08pjs4c_pv_s2apjs6e, hf_s19pjs5d_pv_s3bpjs7f));
let ab = simd.catlo(aa, bb);
y[0] = ab;
let cd = simd.catlo(cc, dd);
y[1] = cd;
let ef = simd.catlo(ee, ff);
y[2] = ef;
let gh = simd.catlo(gg, hh);
y[3] = gh;
let ab = simd.cathi(aa, bb);
y[4] = ab;
let cd = simd.cathi(cc, dd);
y[5] = cd;
let ef = simd.cathi(ee, ff);
y[6] = ef;
let gh = simd.cathi(gg, hh);
y[7] = gh;
let ab = simd.catlo(aa, bb);
y[0x0] = ab;
let cd = simd.catlo(cc, dd);

View File

@@ -1,6 +1,8 @@
use ark_ff::biginteger::arithmetic::widening_mul;
use rand::prelude::*;
use crate::proofs::ProofSanityCheckMode;
/// Avoid overflows for squares of u64
pub fn sqr(x: u64) -> u128 {
let x = x as u128;
@@ -188,21 +190,24 @@ impl Montgomery {
}
}
pub fn four_squares(v: u128) -> [u64; 4] {
pub fn four_squares(v: u128, sanity_check_mode: ProofSanityCheckMode) -> [u64; 4] {
let rng = &mut StdRng::seed_from_u64(0);
// In the extreme case where the noise is exactly at the bound, v is 0
if v == 0 {
return [0; 4];
}
// Handle limit cases that would trigger an infinite loop
match v {
0 => return [0; 4],
2 => return [1, 1, 0, 0],
6 => return [2, 1, 1, 0],
_ => {}
};
let f = v % 4;
if f == 2 {
let b = v.isqrt() as u64;
'main_loop: loop {
let x = 2 + rng.gen::<u64>() % (b - 2);
let y = 2 + rng.gen::<u64>() % (b - 2);
let x: u64 = rng.gen_range(0..=b);
let y: u64 = rng.gen_range(0..=b);
let (sum, o) = u128::overflowing_add(sqr(x), sqr(y));
if o || sum > v {
@@ -270,21 +275,80 @@ pub fn four_squares(v: u128) -> [u64; 4] {
return [x, y, z, w];
}
} else if f == 0 {
four_squares(v / 4).map(|x| x + x)
four_squares(v / 4, sanity_check_mode).map(|x| x + x)
} else {
let mut r = four_squares(2 * v);
r.sort_by_key(|&x| {
if x % 2 == 0 {
-1 - ((x / 2) as i64)
} else {
(x / 2) as i64
}
});
// v is odd, compute the four squares for 2*v and deduce the result for v
let double = match sanity_check_mode {
ProofSanityCheckMode::Panic => v.checked_mul(2).unwrap(),
#[cfg(test)]
ProofSanityCheckMode::Ignore => v.wrapping_mul(2),
};
let mut r = four_squares(double, sanity_check_mode);
// At this point we know that exactly 2 values of r are even and 2 are odd:
// r = [w, x, y, z]
// 2v = w² + x² + y² + z²
// We cannot have 4 even numbers because
// 2v = (2w')²+(2x')²+(2y')²+(2z')² = 4v', but v is odd
// We cannot have 4 odd numbers because
// 2v = (2w'+1)²+(2x'+1)²+(2y'+1)²+(2z'+1)²
// = (4w'²+4w'+1)+(4x'²+4x'+1)+(4y'²+4y'+1)+(4z'²+4z'+1) = 4v', same issue
//
// Since w² + x² + y² + z² is even we must have 2 of them odd and 2 of them even
// Sort so that r[0], r[1] are even and r[2], r[3] are odd,
// with r[1] > r[0] and r[3] > r[2]
r.sort_by_key(|&x| (x % 2 != 0, x));
[
(r[0] + r[1]) / 2,
(r[0] - r[1]) / 2,
(r[3] + r[2]) / 2,
// divide by 2 before addition to avoid overflows
(r[1] / 2 + r[0] / 2),
(r[1] - r[0]) / 2,
(r[3] / 2 + r[2] / 2) + 1,
(r[3] - r[2]) / 2,
]
}
}
#[cfg(test)]
mod test {
use rand::rngs::StdRng;
use rand::{thread_rng, Rng, SeedableRng};
use super::*;
fn assert_four_squares(value: u128) {
let squares = four_squares(value, ProofSanityCheckMode::Panic);
let res = squares.iter().map(|x| sqr(*x)).sum();
assert_eq!(value, res);
}
#[test]
fn test_four_squares() {
const RAND_TESTS_COUNT: usize = 1000;
let seed = thread_rng().gen();
println!("four_squares seed: {seed:x}");
let rng = &mut StdRng::seed_from_u64(seed);
for val in 0..256 {
assert_four_squares(val);
}
// If v % 4 = 1 or 3, v will be multiplied by 2
assert_four_squares(u128::MAX / 2);
assert_four_squares(u128::MAX / 2 - 1);
assert_four_squares(u128::MAX / 2 - 2);
assert_four_squares(u128::MAX / 2 - 3);
for i in 8..127 {
assert_four_squares((1u128 << i) + 1);
}
for _ in 0..RAND_TESTS_COUNT {
let v: u128 = rng.gen_range(0..(u128::MAX / 2));
assert_four_squares(v);
}
}
}
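For reference, the recombination in the odd-`v` branch follows a classical identity (a quick sanity check, using the same naming as the comment above): if `2v = w² + x² + y² + z²` with `w ≥ x` the two even entries of `r` and `y ≥ z` the two odd ones, then

$$
v = \left(\frac{w+x}{2}\right)^2 + \left(\frac{w-x}{2}\right)^2 + \left(\frac{y+z}{2}\right)^2 + \left(\frac{y-z}{2}\right)^2,
$$

since the right-hand side expands to $$\frac{w^2+x^2}{2} + \frac{y^2+z^2}{2} = \frac{2v}{2} = v$$, and every term is an integer because `w, x` share the same parity and so do `y, z`.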

View File

@@ -144,7 +144,7 @@ impl<G: Curve> GroupElements<G> {
/// Allows computing proofs with bad inputs for tests
#[derive(Copy, Clone, PartialEq, Eq)]
enum ProofSanityCheckMode {
pub(crate) enum ProofSanityCheckMode {
Panic,
#[cfg(test)]
Ignore,

View File

@@ -928,7 +928,7 @@ fn prove_impl<G: Curve>(
)
.collect::<Box<[_]>>();
let v = four_squares(B_squared - e_sqr_norm).map(|v| v as i64);
let v = four_squares(B_squared - e_sqr_norm, sanity_check_mode).map(|v| v as i64);
let e1_zp = &*e1
.iter()

View File

@@ -86,7 +86,7 @@ wasm-bindgen-rayon = { version = "1.3.0", optional = true }
js-sys = { version = "0.3", optional = true }
console_error_panic_hook = { version = "0.1.7", optional = true }
serde-wasm-bindgen = { version = "0.6.0", optional = true }
getrandom = { version = "0.2.8", optional = true }
getrandom = { workspace = true, optional = true }
bytemuck = { workspace = true }
tfhe-hpu-backend = { version = "0.2", path = "../backends/tfhe-hpu-backend", optional = true }

View File

@@ -2,7 +2,7 @@
This document provides basic instructions to configure the Rust toolchain and features for **TFHE-rs.**
**TFHE-rs** requires a nightly Rust toolchain to build the C API and utilize advanced SIMD instructions. However, for other uses, a stable toolchain (version 1.81 or later) is sufficient.
**TFHE-rs** requires a nightly Rust toolchain to build the C API and utilize advanced SIMD instructions. However, for other uses, a stable toolchain (version 1.84 or later) is sufficient.
Follow these instructions to install the necessary Rust toolchain:

View File

@@ -73,15 +73,6 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
}
```
Performance can be improved by setting `lto="fat"` in `Cargo.toml`
```toml
[profile.release]
lto = "fat"
```
and by building the code for the native CPU architecture and in release mode, e.g. by calling `RUSTFLAGS="-C target-cpu=native" cargo run --release`.
{% hint style="info" %}
You can choose a more costly proof with `ZkComputeLoad::Proof`, which has a faster verification time. Alternatively, you can select `ZkComputeLoad::Verify` for a faster proof and slower verification.
{% endhint %}

View File

@@ -7,7 +7,6 @@ All parameter sets provide at least 128-bits of security according to the [Latti
## Default parameters
Currently, the default parameters use blocks that contain 2 bits of message and 2 bits of carry - a tweaked uniform (TUniform, defined [here](../../getting-started/security-and-cryptography.md#noise)) noise distribution, and have a bootstrapping failure probability $$p_{error} \le 2^{-128}$$.
These are particularly suitable for applications that need to be secure in the IND-CPA^D model (see [here](../../getting-started/security-and-cryptography.md#security) for more details).
The GPU backend still uses an error probability smaller than $$2^{-64}$$ by default. Those will be updated soon.
When using the high-level API of **TFHE-rs**, you can create a key pair using the default recommended set of parameters. For example:
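A minimal sketch of such a key-pair generation with the default configuration, using the `ConfigBuilder` and `generate_keys` entry points of the high-level API:

```rust
use tfhe::{generate_keys, ConfigBuilder};

fn main() {
    // Default configuration: the recommended parameter set described above.
    let config = ConfigBuilder::default().build();

    // The client key encrypts/decrypts; the server key runs homomorphic operations.
    let (_client_key, _server_key) = generate_keys(config);
}
```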

View File

@@ -11,7 +11,7 @@ tfhe = { version = "~1.3.0", features = ["boolean", "shortint", "integer"] }
```
{% hint style="info" %}
**Rust version**: a minimum Rust version of 1.81 is required to compile **TFHE-rs**.
**Rust version**: a minimum Rust version of 1.84 is required to compile **TFHE-rs**.
{% endhint %}
{% hint style="success" %}

View File

@@ -85,7 +85,8 @@ Now that the project has **TFHE-rs** as a dependency here are the detailed steps
5. Server-side: [compute over encrypted data](../fhe-computation/compute/)
6. Client-side: [decrypt data](../fhe-computation/compute/decrypt-data.md)
This example demonstrates the basic workflow combining the client and server parts:
This example demonstrates the basic workflow combining the client and server parts.
Edit the `main.rs` file that was generated in the previous step and add this content:
```rust
use tfhe::{ConfigBuilder, generate_keys, set_server_key, FheUint8};
@@ -116,4 +117,21 @@ fn main() {
}
```
Now, you are ready to compute your first homomorphic operations! To run the project, simply invoke the following command:
```console
$ cargo run --release
```
{% hint style="success" %}
**Performance**: for optimal performance, it is highly recommended to run code that uses **`TFHE-rs`** in release mode with cargo's `--release` flag.
{% endhint %}
You can learn more about homomorphic types and associated compilation features in the [configuration documentation.](../configuration/rust-configuration.md)
## Performance tips
Performance can be further improved by setting `lto="fat"` in `Cargo.toml`:
```toml
[profile.release]
lto = "fat"
```
If your application does not need to be portable, you can also build the code for the native CPU architecture, e.g. by calling `RUSTFLAGS="-C target-cpu=native" cargo run --release`.
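For instance, the full command line then looks like this:
```console
$ RUSTFLAGS="-C target-cpu=native" cargo run --release
```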

View File

@@ -97,7 +97,7 @@ For example, when adding two ciphertexts, the sum could exceed the range of eith
By default, the cryptographic parameters provided by **TFHE-rs** ensure at least 128 bits of security. The security has been evaluated using the latest versions of the Lattice Estimator ([repository](https://github.com/malb/lattice-estimator)) with `red_cost_model = reduction.RC.BDGL16`.
For the High-Level API the default parameters are selected with a bootstrapping failure probability (or error probability) fixed at $$p_{error} \le 2^{-128}$$ for the x86 CPU backend, and $$p_{error} \le 2^{-64}$$ for the GPU backend.
For the High-Level API the default parameters are selected with a bootstrapping failure probability (or error probability) fixed at $$p_{error} \le 2^{-128}$$ for all backends (x86 CPU, GPU and HPU).
A failure probability below $$2^{-128}$$ ensures that our implementation is resilient against attacks in the IND-CPA-D model [1]. In the case where only the IND-CPA model is considered, it is possible to choose parameters with $$p_{error} \le 2^{-64}$$; see the dedicated [Parameters section](../fhe-computation/compute/parameters.md).
\[1][ Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022.](https://eprint.iacr.org/2022/816.pdf)

View File

@@ -1,6 +1,6 @@
# Cryptographic Parameters
All parameter sets provide at least 128-bits of security according to the [Lattice-Estimator](https://github.com/malb/lattice-estimator), with an error probability equal to $$2^{-64}$$ when using programmable bootstrapping. This error probability is due to the randomness added at each encryption (see [here](../../../getting-started/security-and-cryptography.md) for more details about the encryption process).
All parameter sets provide at least 128-bits of security according to the [Lattice-Estimator](https://github.com/malb/lattice-estimator). Default parameters have an error probability equal to $$2^{-128}$$ when using programmable bootstrapping. This error probability is due to the randomness added at each encryption (see [here](../../../getting-started/security-and-cryptography.md) for more details about the encryption process).
## Parameters and message precision

View File

@@ -728,8 +728,11 @@ pub fn encrypt_glwe_ciphertext_list<
Gen: ByteRandomGenerator,
{
assert!(
output_glwe_ciphertext_list.polynomial_size().0
* output_glwe_ciphertext_list.glwe_ciphertext_count().0
output_glwe_ciphertext_list
.polynomial_size()
.0
.checked_mul(output_glwe_ciphertext_list.glwe_ciphertext_count().0)
.unwrap()
== input_plaintext_list.plaintext_count().0,
"Mismatch between required number of plaintexts: {} ({:?} * {:?}) and input \
PlaintextCount: {:?}",
@@ -1349,7 +1352,11 @@ pub fn encrypt_seeded_glwe_ciphertext_list_with_pre_seeded_generator<
output.glwe_size().to_glwe_dimension(),
);
assert!(
output.glwe_ciphertext_count().0 * output.polynomial_size().0
output
.glwe_ciphertext_count()
.0
.checked_mul(output.polynomial_size().0)
.unwrap()
== encoded.plaintext_count().0,
"Mismatch between number of output ciphertexts and input plaintexts. \
Got {:?} plaintexts while {:?} plaintexts are required to encrypt {:?} ciphertexts.",

View File

@@ -7,7 +7,7 @@ use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
use crate::core_crypto::commons::math::ntt::ntt64::{Ntt64, Ntt64View};
use crate::core_crypto::commons::parameters::{GlweSize, MonomialDegree, PolynomialSize};
use crate::core_crypto::commons::traits::*;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{lwe_ciphertext_modulus_switch, ModulusSwitchedLweCiphertext};
use aligned_vec::CACHELINE_ALIGN;
@@ -203,6 +203,9 @@ pub fn blind_rotate_ntt64_bnf_assign<OutputCont, KeyCont>(
/// a properly configured [`Ntt64View`] object and a `PodStack` used as a memory buffer having a
/// capacity at least as large as the result of
/// [`blind_rotate_ntt64_bnf_assign_mem_optimized_requirement`].
///
/// # Panics
/// This will panic if the input mask length does not match the size of the bsk
pub fn blind_rotate_ntt64_bnf_assign_mem_optimized<OutputCont, KeyCont>(
msed_input: &impl ModulusSwitchedLweCiphertext<usize>,
lut: &mut GlweCiphertext<OutputCont>,
@@ -234,7 +237,8 @@ pub fn blind_rotate_ntt64_bnf_assign_mem_optimized<OutputCont, KeyCont>(
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
for (lwe_mask_element, bootstrap_key_ggsw) in izip!(msed_lwe_mask, bsk.into_ggsw_iter()) {
for (lwe_mask_element, bootstrap_key_ggsw) in izip_eq!(msed_lwe_mask, bsk.into_ggsw_iter())
{
if lwe_mask_element != 0 {
// We copy ct_0 to ct_1
let (ct1, stack) =
@@ -623,7 +627,7 @@ pub(crate) fn add_external_product_ntt64_bnf_assign<InputGlweCont>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
)
@@ -657,7 +661,7 @@ pub(crate) fn add_external_product_ntt64_bnf_assign<InputGlweCont>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
izip!(
izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(poly_size)
@@ -675,6 +679,9 @@ pub(crate) fn add_external_product_ntt64_bnf_assign<InputGlweCont>(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0 and ct1 are not of the same size
pub(crate) fn cmux_ntt64_bnf_assign(
ct0: GlweCiphertextMutView<'_, u64>,
ct1: GlweCiphertextMutView<'_, u64>,
@@ -689,7 +696,7 @@ pub(crate) fn cmux_ntt64_bnf_assign(
ntt: Ntt64View<'_>,
stack: &mut PodStack,
) {
izip!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
izip_eq!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
*c1 = c1.wrapping_sub(*c0);
});
add_external_product_ntt64_bnf_assign(ct0, ggsw, &ct1, ntt, stack);
@@ -711,7 +718,7 @@ pub(crate) fn update_with_fmadd_ntt64_bnf(
output_fft_buffer.fill(0);
}
izip!(
izip_eq!(
output_fft_buffer.into_chunks(poly_size),
lhs_polynomial_list.into_chunks(poly_size)
)

View File

@@ -15,7 +15,7 @@ use crate::core_crypto::commons::math::decomposition::{
use crate::core_crypto::commons::math::ntt::ntt64::{Ntt64, Ntt64View};
use crate::core_crypto::commons::parameters::{GlweSize, MonomialDegree, PolynomialSize};
use crate::core_crypto::commons::traits::*;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use aligned_vec::CACHELINE_ALIGN;
use dyn_stack::{PodStack, SizeOverflow, StackReq};
@@ -208,6 +208,9 @@ pub fn blind_rotate_ntt64_assign<InputCont, OutputCont, KeyCont>(
/// a properly configured [`Ntt64View`] object and a `PodStack` used as a memory buffer having a
/// capacity at least as large as the result of
/// [`blind_rotate_ntt64_assign_mem_optimized_requirement`].
///
/// # Panics
/// This will panic if the input mask length does not match the size of the bsk
pub fn blind_rotate_ntt64_assign_mem_optimized<InputCont, OutputCont, KeyCont>(
input: &LweCiphertext<InputCont>,
lut: &mut GlweCiphertext<OutputCont>,
@@ -250,7 +253,9 @@ pub fn blind_rotate_ntt64_assign_mem_optimized<InputCont, OutputCont, KeyCont>(
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
for (lwe_mask_element, bootstrap_key_ggsw) in izip!(lwe_mask.iter(), bsk.into_ggsw_iter()) {
for (lwe_mask_element, bootstrap_key_ggsw) in
izip_eq!(lwe_mask.iter(), bsk.into_ggsw_iter())
{
if *lwe_mask_element != 0u64 {
let stack = &mut *stack;
// We copy ct_0 to ct_1
@@ -615,7 +620,7 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
)
@@ -647,7 +652,7 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
izip!(
izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(poly_size)
@@ -660,6 +665,9 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0 and ct1 are not of the same size
pub(crate) fn cmux_ntt64_assign(
ct0: GlweCiphertextMutView<'_, u64>,
mut ct1: GlweCiphertextMutView<'_, u64>,
@@ -667,7 +675,7 @@ pub(crate) fn cmux_ntt64_assign(
ntt: Ntt64View<'_>,
stack: &mut PodStack,
) {
izip!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
izip_eq!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
*c1 = c1.wrapping_sub_custom_mod(*c0, ntt.custom_modulus());
});
add_external_product_ntt64_assign(ct0, ggsw, &ct1, ntt, stack);
@@ -686,7 +694,7 @@ pub(crate) fn update_with_fmadd_ntt64(
output_fft_buffer.fill(0);
}
izip!(
izip_eq!(
output_fft_buffer.into_chunks(poly_size),
lhs_polynomial_list.into_chunks(poly_size)
)

View File

@@ -5,7 +5,7 @@ pub trait ModulusSwitchedLweCiphertext<Scalar> {
fn log_modulus(&self) -> CiphertextModulusLog;
fn lwe_dimension(&self) -> LweDimension;
fn body(&self) -> Scalar;
fn mask(&self) -> impl Iterator<Item = Scalar> + '_;
fn mask(&self) -> impl ExactSizeIterator<Item = Scalar> + '_;
}
pub fn lwe_ciphertext_modulus_switch<Scalar, SwitchedScalar, Cont>(

View File

@@ -1,5 +1,5 @@
use crate::core_crypto::commons::ciphertext_modulus::CiphertextModulusKind;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::prelude::*;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
@@ -123,7 +123,7 @@ impl Ntt64View<'_> {
pulp::Arch::new().dispatch(
#[inline(always)]
|| {
for (out, inp) in izip!(standard, &*ntt) {
for (out, inp) in izip_eq!(standard, &*ntt) {
*out = u64::wrapping_add_custom_mod(*out, *inp, self.custom_modulus());
}
},
@@ -258,7 +258,7 @@ impl Ntt64View<'_> {
pulp::Arch::new().dispatch(
#[inline(always)]
|| {
for (out, inp) in izip!(standard, &*ntt) {
for (out, inp) in izip_eq!(standard, &*ntt) {
*out = u64::wrapping_add(*out, *inp);
}
},

View File

@@ -141,7 +141,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
// mid here is the number of ref_elements, we need to multiply by the size of a single
// element to know where to split the underlying container
let mid = mid * self.get_entity_view_pod_size();
let mid = mid.checked_mul(self.get_entity_view_pod_size()).unwrap();
let self_meta = self.get_self_view_creation_metadata();
let (container_left, container_right) = self.as_ref().split_at(mid);
@@ -156,7 +156,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
// index here is the number of ref_elements, we need to multiply by the size of a single
// element to know where to reference the underlying container
let start = index * self.get_entity_view_pod_size();
let start = index.checked_mul(self.get_entity_view_pod_size()).unwrap();
let stop = start + self.get_entity_view_pod_size();
let meta = self.get_entity_view_creation_metadata();
@@ -178,8 +178,12 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
Bound::Unbounded => self.entity_count(),
};
let start_index = entity_start_index * self.get_entity_view_pod_size();
let stop_index = entity_stop_index * self.get_entity_view_pod_size();
let start_index = entity_start_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let stop_index = entity_stop_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let self_meta = self.get_self_view_creation_metadata();
@@ -214,7 +218,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -235,7 +239,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -273,7 +277,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -299,7 +303,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -386,8 +390,12 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
Bound::Unbounded => self.entity_count(),
};
let start_index = entity_start_index * self.get_entity_view_pod_size();
let stop_index = entity_stop_index * self.get_entity_view_pod_size();
let start_index = entity_start_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let stop_index = entity_stop_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let self_meta = self.get_self_view_creation_metadata();
@@ -412,7 +420,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()
@@ -434,7 +442,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()
@@ -487,7 +495,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()
@@ -513,7 +521,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()

View File

@@ -1,32 +1,30 @@
//! Utilities for the library.
#[track_caller]
#[inline]
fn assert_same_len(a: (usize, Option<usize>), b: (usize, Option<usize>)) {
debug_assert_eq!(a.1, Some(a.0));
debug_assert_eq!(b.1, Some(b.0));
debug_assert_eq!(a.0, b.0);
}
/// Return a Zip iterator, but checks that the two components have the same length.
pub trait ZipChecked: IntoIterator + Sized {
pub trait ZipChecked: IntoIterator + Sized
where
<Self as IntoIterator>::IntoIter: ExactSizeIterator,
{
#[track_caller]
#[inline]
fn zip_checked<B: IntoIterator>(
self,
b: B,
) -> core::iter::Zip<<Self as IntoIterator>::IntoIter, <B as IntoIterator>::IntoIter> {
) -> core::iter::Zip<<Self as IntoIterator>::IntoIter, <B as IntoIterator>::IntoIter>
where
<B as IntoIterator>::IntoIter: ExactSizeIterator,
{
let a = self.into_iter();
let b = b.into_iter();
assert_same_len(a.size_hint(), b.size_hint());
assert_eq!(a.len(), b.len());
core::iter::zip(a, b)
}
}
impl<A: IntoIterator> ZipChecked for A {}
impl<A: IntoIterator> ZipChecked for A where <A as IntoIterator>::IntoIter: ExactSizeIterator {}
// https://docs.rs/itertools/0.7.8/src/itertools/lib.rs.html#247-269
macro_rules! izip {
macro_rules! izip_eq {
(@ __closure @ ($a:expr)) => { |a| (a,) };
(@ __closure @ ($a:expr, $b:expr)) => { |(a, b)| (a, b) };
(@ __closure @ ($a:expr, $b:expr, $c:expr)) => { |((a, b), c)| (a, b, c) };
@@ -54,11 +52,11 @@ macro_rules! izip {
{
#[allow(unused_imports)]
use $crate::core_crypto::commons::utils::ZipChecked;
::core::iter::IntoIterator::into_iter($first)
$first
$(.zip_checked($rest))*
.map($crate::core_crypto::commons::utils::izip!(@ __closure @ ($first, $($rest),*)))
.map($crate::core_crypto::commons::utils::izip_eq!(@ __closure @ ($first, $($rest),*)))
}
};
}
pub(crate) use izip;
pub(crate) use izip_eq;
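To illustrate the behaviour this rework enforces (a standalone sketch, not the library macro itself): zipping now requires `ExactSizeIterator` on both sides and asserts that the lengths are equal in all build profiles, instead of relying on `size_hint` checks that only ran in debug builds.

```rust
// Hypothetical free-function analogue of the length-checked zip that
// `izip_eq!` builds on (the real code is a macro over `zip_checked`).
fn zip_eq<A, B>(a: A, b: B) -> impl Iterator<Item = (A::Item, B::Item)>
where
    A: IntoIterator,
    B: IntoIterator,
    A::IntoIter: ExactSizeIterator,
    B::IntoIter: ExactSizeIterator,
{
    let a = a.into_iter();
    let b = b.into_iter();
    // Always checked, even in release builds.
    assert_eq!(a.len(), b.len(), "zipped iterators have different lengths");
    a.zip(b)
}

fn main() {
    let sum: u64 = zip_eq([1u64, 2, 3], [10u64, 20, 30])
        .map(|(x, y)| x + y)
        .sum();
    assert_eq!(sum, 66);
    // `zip_eq([1u64, 2], [1u64, 2, 3])` would panic on the length assertion.
}
```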

View File

@@ -255,9 +255,6 @@ pub fn glwe_ciphertext_encryption_noise_sample_count(
/// A [`GLWE ciphertext`](`GlweCiphertext`).
///
/// **Remark:** GLWE ciphertexts generalize LWE ciphertexts by definition, however in this library,
/// GLWE ciphertext entities do not generalize LWE ciphertexts, i.e., polynomial size cannot be 1.
///
/// # Formal Definition
///
/// ## GLWE Ciphertext

View File

@@ -6,7 +6,7 @@ use crate::core_crypto::commons::parameters::{
PolynomialSize,
};
use crate::core_crypto::commons::traits::{Container, Split};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::fft_impl::fft128::crypto::ggsw::Fourier128GgswCiphertext;
use crate::core_crypto::prelude::MultiBitBootstrapKeyConformanceParams;
@@ -77,7 +77,7 @@ impl<C: Container<Element = f64>> Fourier128LweMultiBitBootstrapKey<C> {
let ggsw_count =
multi_bit_lwe_dim.0 * self.grouping_factor().ggsw_per_multi_bit_element().0;
izip!(
izip_eq!(
self.data_re0.split_into(ggsw_count),
self.data_re1.split_into(ggsw_count),
self.data_im0.split_into(ggsw_count),

View File

@@ -52,7 +52,7 @@ impl<Scalar: Copy> ModulusSwitchedLweCiphertext<Scalar>
*self.container.last().unwrap()
}
fn mask(&self) -> impl Iterator<Item = Scalar> + '_ {
fn mask(&self) -> impl ExactSizeIterator<Item = Scalar> + '_ {
let (_body, mask) = self.container.split_last().unwrap();
mask.iter().copied()
@@ -138,7 +138,7 @@ where
.cast_into()
}
fn mask(&self) -> impl Iterator<Item = SwitchedScalar> {
fn mask(&self) -> impl ExactSizeIterator<Item = SwitchedScalar> {
self.lwe_in
.as_ref()
.split_last()

View File

@@ -288,7 +288,9 @@ impl<Scalar: UnsignedInteger, C: Container<Element = Scalar>> NttGgswLevelMatrix
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = NttGgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = NttGgswLevelRow<C>> + ExactSizeIterator<Item = NttGgswLevelRow<C>>
where
C: Split,
{

View File

@@ -207,7 +207,10 @@ impl<Scalar: UnsignedInteger, C: Container<Element = Scalar>> NttGgswCiphertextL
/// consider calling [`NttGgswCiphertextList::as_view`] or
/// [`NttGgswCiphertextList::as_mut_view`] first to have an iterator over borrowed contents
/// instead of consuming the original entity.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
+ ExactSizeIterator<Item = NttGgswCiphertext<C>>
where
C: Split,
{

View File

@@ -130,7 +130,10 @@ impl<Scalar: UnsignedInteger, C: Container<Element = Scalar>> NttLweBootstrapKey
/// consider calling [`NttLweBootstrapKey::as_view`] or
/// [`NttLweBootstrapKey::as_mut_view`] first to have an iterator over borrowed contents
/// instead of consuming the original entity.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
+ ExactSizeIterator<Item = NttGgswCiphertext<C>>
where
C: Split,
{

View File

@@ -3,7 +3,7 @@
use crate::core_crypto::commons::math::decomposition::SignedDecomposer;
use crate::core_crypto::commons::parameters::*;
use crate::core_crypto::commons::traits::*;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::experimental::entities::fourier_pseudo_ggsw_ciphertext::{
PseudoFourierGgswCiphertext, PseudoFourierGgswCiphertextView,
@@ -237,7 +237,7 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.get_mask().as_polynomial_list().iter()
)
@@ -276,7 +276,7 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
izip!(
izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(fourier_poly_size)

View File

@@ -5,7 +5,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, IntoContainerOwned, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::experimental::entities::PseudoGgswCiphertext;
use crate::core_crypto::fft_impl::fft64::math::decomposition::DecompositionLevel;
use crate::core_crypto::fft_impl::fft64::math::fft::{FftView, FourierPolynomialList};
@@ -159,7 +159,10 @@ impl<C: Container<Element = c64>> PseudoFourierGgswLevelMatrix<C> {
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = PseudoFourierGgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = PseudoFourierGgswLevelRow<C>>
+ ExactSizeIterator<Item = PseudoFourierGgswLevelRow<C>>
where
C: Split,
{
@@ -278,7 +281,7 @@ impl PseudoFourierGgswCiphertextMutView<'_> {
debug_assert_eq!(coef_ggsw.polynomial_size(), self.polynomial_size());
let fourier_poly_size = coef_ggsw.polynomial_size().to_fourier_polynomial_size().0;
for (fourier_poly, coef_poly) in izip!(
for (fourier_poly, coef_poly) in izip_eq!(
self.data().into_chunks(fourier_poly_size),
coef_ggsw.as_polynomial_list().iter()
) {

View File

@@ -14,7 +14,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::ggsw_ciphertext::fourier_ggsw_ciphertext_size;
use crate::core_crypto::entities::*;
use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
@@ -78,11 +78,14 @@ impl<C: Container<Element = f64>> Fourier128LweBootstrapKey<C> {
}
/// Return an iterator over the GGSW ciphertexts composing the key.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = Fourier128GgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = Fourier128GgswCiphertext<C>>
+ ExactSizeIterator<Item = Fourier128GgswCiphertext<C>>
where
C: Split,
{
izip!(
izip_eq!(
self.data_re0.split_into(self.input_lwe_dimension.0),
self.data_re1.split_into(self.input_lwe_dimension.0),
self.data_im0.split_into(self.input_lwe_dimension.0),
@@ -216,7 +219,9 @@ where
coef_bsk: LweBootstrapKey<&[Scalar]>,
fft: Fft128View<'_>,
) {
for (mut fourier_ggsw, standard_ggsw) in izip!(this.into_ggsw_iter(), coef_bsk.iter()) {
for (mut fourier_ggsw, standard_ggsw) in
izip_eq!(this.into_ggsw_iter(), coef_bsk.iter())
{
fourier_ggsw.fill_with_forward_fourier(&standard_ggsw, fft);
}
}
@@ -287,7 +292,7 @@ where
let mut ct0 = lut;
for (lwe_mask_element, bootstrap_key_ggsw) in
izip!(msed_lwe_mask, this.into_ggsw_iter())
izip_eq!(msed_lwe_mask, this.into_ggsw_iter())
{
if lwe_mask_element != 0 {
let stack = &mut *stack;

View File

@@ -8,7 +8,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::ggsw_ciphertext::{
fourier_ggsw_ciphertext_size, fourier_ggsw_level_matrix_size, GgswCiphertext,
};
@@ -113,6 +113,9 @@ impl<C: Container<Element = f64>> Fourier128GgswCiphertext<C> {
decomposition_level_count,
);
assert_eq!(data_re0.container_len(), container_len);
assert_eq!(data_re1.container_len(), container_len);
assert_eq!(data_im0.container_len(), container_len);
assert_eq!(data_im1.container_len(), container_len);
Self {
data_re0,
@@ -184,7 +187,7 @@ impl<C: Container<Element = f64>> Fourier128GgswCiphertext<C> {
C: Split,
{
let decomposition_level_count = self.decomposition_level_count.0;
izip!(
izip_eq!(
self.data_re0.split_into(decomposition_level_count),
self.data_re1.split_into(decomposition_level_count),
self.data_im0.split_into(decomposition_level_count),
@@ -235,12 +238,15 @@ impl<C: Container<Element = f64>> Fourier128GgswLevelMatrix<C> {
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = Fourier128GgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = Fourier128GgswLevelRow<C>>
+ ExactSizeIterator<Item = Fourier128GgswLevelRow<C>>
where
C: Split,
{
let row_count = self.row_count();
izip!(
izip_eq!(
self.data_re0.split_into(row_count),
self.data_re1.split_into(row_count),
self.data_im0.split_into(row_count),
@@ -349,7 +355,7 @@ where
let (data_re0, data_re1, data_im0, data_im1) = this.data();
for (fourier_re0, fourier_re1, fourier_im0, fourier_im1, coef_poly) in izip!(
for (fourier_re0, fourier_re1, fourier_im0, fourier_im1, coef_poly) in izip_eq!(
data_re0.into_chunks(poly_size),
data_re1.into_chunks(poly_size),
data_im0.into_chunks(poly_size),
@@ -485,7 +491,7 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
//
// t = 1 t = 2 ...
for (ggsw_row, glwe_poly) in izip!(
for (ggsw_row, glwe_poly) in izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
) {
@@ -531,7 +537,7 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
for (mut out, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip!(
for (mut out, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer_re0.into_chunks(fourier_poly_size),
output_fft_buffer_re1.into_chunks(fourier_poly_size),
@@ -604,7 +610,7 @@ fn update_with_fmadd_scalar(
rhs_re1,
rhs_im0,
rhs_im1,
) in izip!(
) in izip_eq!(
output_fourier_re0,
output_fourier_re1,
output_fourier_im0,
@@ -647,7 +653,7 @@ fn update_with_fmadd_scalar(
rhs_re1,
rhs_im0,
rhs_im1,
) in izip!(
) in izip_eq!(
output_fourier_re0,
output_fourier_re1,
output_fourier_im0,
@@ -703,7 +709,7 @@ pub fn update_with_fmadd(
ggsw_poly_re1,
ggsw_poly_im0,
ggsw_poly_im1,
) in izip!(
) in izip_eq!(
output_fft_buffer_re0.into_chunks(fourier_poly_size),
output_fft_buffer_re1.into_chunks(fourier_poly_size),
output_fft_buffer_im0.into_chunks(fourier_poly_size),
@@ -797,6 +803,9 @@ pub fn cmux_scratch<Scalar>(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0 and ct1 are not of the same size
pub fn cmux<Scalar, ContCt0, ContCt1, ContGgsw>(
ct0: &mut GlweCiphertext<ContCt0>,
ct1: &mut GlweCiphertext<ContCt1>,
@@ -816,7 +825,7 @@ pub fn cmux<Scalar, ContCt0, ContCt1, ContGgsw>(
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
for (c1, c0) in izip!(ct1.as_mut(), ct0.as_ref()) {
for (c1, c0) in izip_eq!(ct1.as_mut(), ct0.as_ref()) {
*c1 = c1.wrapping_sub(*c0);
}
add_external_product_assign(&mut ct0, &ggsw, &ct1, fft, stack);

View File

@@ -1,7 +1,7 @@
use crate::core_crypto::commons::math::torus::UnsignedTorus;
use crate::core_crypto::commons::numeric::{CastFrom, CastInto, UnsignedInteger};
use crate::core_crypto::commons::parameters::PolynomialSize;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use core::any::TypeId;
use dyn_stack::{PodStack, SizeOverflow, StackReq};
use std::collections::hash_map::Entry;
@@ -277,7 +277,7 @@ pub fn convert_forward_torus<Scalar: UnsignedTorus>(
let normalization = 2.0_f64.powi(-(Scalar::BITS as i32));
for (out_re0, out_re1, out_im0, out_im1, &in_re, &in_im) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
{
let out_re = to_signed_to_f128(in_re);
let out_im = to_signed_to_f128(in_im);
@@ -301,7 +301,7 @@ pub fn convert_forward_integer<Scalar: UnsignedTorus>(
in_im: &[Scalar],
) {
for (out_re0, out_re1, out_im0, out_im1, &in_re, &in_im) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
{
let out_re = to_signed_to_f128(in_re);
let out_im = to_signed_to_f128(in_im);
@@ -323,7 +323,7 @@ fn convert_add_backward_torus<Scalar: UnsignedTorus>(
) {
let norm = 1.0 / in_re0.len() as f64;
for (out_re, out_im, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
{
let in_re = f128(*in_re0 * norm, *in_re1 * norm);
let in_im = f128(*in_im0 * norm, *in_im1 * norm);
@@ -343,7 +343,7 @@ fn convert_backward_torus<Scalar: UnsignedTorus>(
) {
let norm = 1.0 / in_re0.len() as f64;
for (out_re, out_im, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
{
let in_re = f128(*in_re0 * norm, *in_re1 * norm);
let in_im = f128(*in_im0 * norm, *in_im1 * norm);

View File

@@ -42,7 +42,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
stack,
);
for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS <= 64 {
assert_eq!(*expected, *actual);
} else {
@@ -103,7 +103,7 @@ fn test_product<Scalar: UnsignedTorus>() {
let mut fourier1_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
let integer_magnitude = 16;
for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
for (x, y) in izip_eq!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
*x = generator.random_uniform();
*y = generator.random_uniform();
@@ -128,7 +128,7 @@ fn test_product<Scalar: UnsignedTorus>() {
&poly1,
);
for (f0_re0, f0_re1, f0_im0, f0_im1, f1_re0, f1_re1, f1_im0, f1_im1) in izip!(
for (f0_re0, f0_re1, f0_im0, f0_im1, f1_re0, f1_re1, f1_im0, f1_im1) in izip_eq!(
&mut *fourier0_re0,
&mut *fourier0_re1,
&mut *fourier0_im0,
@@ -161,7 +161,7 @@ fn test_product<Scalar: UnsignedTorus>() {
poly1.as_ref(),
);
for (expected, actual) in izip!(
for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {

View File

@@ -6,7 +6,7 @@ use crate::core_crypto::commons::parameters::{
CiphertextModulus, DecompositionBaseLog, DecompositionLevelCount, MonomialDegree,
};
use crate::core_crypto::commons::traits::ContiguousEntityContainerMut;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{Container, ContainerMut, ModulusSwitchedLweCiphertext};
use aligned_vec::CACHELINE_ALIGN;
@@ -21,13 +21,13 @@ pub fn polynomial_wrapping_monic_monomial_mul_assign_split(
let output_hi = output_hi.into_container();
let full_cycles_count = monomial_degree.0 / output_lo.container_len();
if full_cycles_count % 2 != 0 {
izip!(&mut *output_lo, &mut *output_hi)
izip_eq!(&mut *output_lo, &mut *output_hi)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
}
let remaining_degree = monomial_degree.0 % output_lo.container_len();
output_lo.rotate_right(remaining_degree);
output_hi.rotate_right(remaining_degree);
izip!(output_lo, output_hi)
izip_eq!(output_lo, output_hi)
.take(remaining_degree)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
}
@@ -41,13 +41,13 @@ pub fn polynomial_wrapping_monic_monomial_div_assign_split(
let output_hi = output_hi.into_container();
let full_cycles_count = monomial_degree.0 / output_lo.container_len();
if full_cycles_count % 2 != 0 {
izip!(&mut *output_lo, &mut *output_hi)
izip_eq!(&mut *output_lo, &mut *output_hi)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
}
let remaining_degree = monomial_degree.0 % output_lo.container_len();
output_lo.rotate_left(remaining_degree);
output_hi.rotate_left(remaining_degree);
izip!(output_lo, output_hi)
izip_eq!(output_lo, output_hi)
.rev()
.take(remaining_degree)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
@@ -79,7 +79,7 @@ where
let msed_lwe_mask = msed_lwe.mask();
let msed_lwe_body = msed_lwe.body();
for (poly_lo, poly_hi) in izip!(
for (poly_lo, poly_hi) in izip_eq!(
lut_lo.as_mut_polynomial_list().iter_mut(),
lut_hi.as_mut_polynomial_list().iter_mut(),
) {
@@ -95,7 +95,7 @@ where
let mut ct0_hi = lut_hi;
for (lwe_mask_element, bootstrap_key_ggsw) in
izip!(msed_lwe_mask, this.into_ggsw_iter())
izip_eq!(msed_lwe_mask, this.into_ggsw_iter())
{
if lwe_mask_element != 0 {
let stack = &mut *stack;
@@ -116,7 +116,7 @@ where
);
// We rotate ct_1 by performing ct_1 <- ct_1 * X^{a_hat}
for (poly_lo, poly_hi) in izip!(
for (poly_lo, poly_hi) in izip_eq!(
ct1_lo.as_mut_polynomial_list().iter_mut(),
ct1_hi.as_mut_polynomial_list().iter_mut(),
) {
@@ -204,7 +204,7 @@ where
);
let (local_accumulator, _) = stack.collect_aligned(
align,
izip!(local_accumulator_lo.as_ref(), local_accumulator_hi.as_ref())
izip_eq!(local_accumulator_lo.as_ref(), local_accumulator_hi.as_ref())
.map(|(&lo, &hi)| lo as u128 | ((hi as u128) << 64)),
);
let mut local_accumulator = GlweCiphertextMutView::from_container(

View File

@@ -4,7 +4,7 @@ use crate::core_crypto::commons::traits::container::Split;
use crate::core_crypto::commons::traits::contiguous_entity_container::{
ContiguousEntityContainer, ContiguousEntityContainerMut,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::fft_impl::fft128::crypto::ggsw::update_with_fmadd;
use crate::core_crypto::prelude::{Container, ContainerMut, SignedDecomposer};
@@ -86,7 +86,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
let (decomposition_states_hi, substack1) =
stack.make_aligned_raw::<u64>(poly_size * glwe_size, align);
for (out_lo, out_hi, in_lo, in_hi) in izip!(
for (out_lo, out_hi, in_lo, in_hi) in izip_eq!(
&mut *decomposition_states_lo,
&mut *decomposition_states_hi,
glwe_lo.as_ref(),
@@ -153,7 +153,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
//
// t = 1 t = 2 ...
for (ggsw_row, glwe_poly_lo, glwe_poly_hi) in izip!(
for (ggsw_row, glwe_poly_lo, glwe_poly_hi) in izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term_lo.as_polynomial_list().iter(),
glwe_decomp_term_hi.as_polynomial_list().iter(),
@@ -202,7 +202,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
for (mut out_lo, mut out_hi, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip!(
for (mut out_lo, mut out_hi, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip_eq!(
out_lo.as_mut_polynomial_list().iter_mut(),
out_hi.as_mut_polynomial_list().iter_mut(),
output_fft_buffer_re0.into_chunks(fourier_poly_size),
@@ -293,7 +293,7 @@ fn collect_next_term_split_avx512(
let base_log_complement = simd.splat_u64x8(64u64.wrapping_sub(base_log));
let base_log = simd.splat_u64x8(base_log);
for (out_lo, out_hi, state_lo, state_hi) in izip!(
for (out_lo, out_hi, state_lo, state_hi) in izip_eq!(
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
@@ -425,7 +425,7 @@ fn collect_next_term_split_avx2(
let base_log_complement = simd.splat_u64x4(64u64.wrapping_sub(base_log));
let base_log = simd.splat_u64x4(base_log);
for (out_lo, out_hi, state_lo, state_hi) in izip!(
for (out_lo, out_hi, state_lo, state_hi) in izip_eq!(
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
@@ -507,7 +507,7 @@ fn collect_next_term_split_scalar(
base_log: usize,
) {
assert!(base_log < 128);
for (out_lo, out_hi, state_lo, state_hi) in izip!(
for (out_lo, out_hi, state_lo, state_hi) in izip_eq!(
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
@@ -603,6 +603,9 @@ fn collect_next_term_split(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0_lo, ct0_hi, ct1_lo and ct1_hi are not of the same size
pub fn cmux_split<ContCt0Lo, ContCt0Hi, ContCt1Lo, ContCt1Hi, ContGgsw>(
ct0_lo: &mut GlweCiphertext<ContCt0Lo>,
ct0_hi: &mut GlweCiphertext<ContCt0Hi>,
@@ -627,7 +630,7 @@ pub fn cmux_split<ContCt0Lo, ContCt0Hi, ContCt1Lo, ContCt1Hi, ContGgsw>(
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
for (c1_lo, c1_hi, c0_lo, c0_hi) in izip!(
for (c1_lo, c1_hi, c0_lo, c0_hi) in izip_eq!(
ct1_lo.as_mut(),
ct1_hi.as_mut(),
ct0_lo.as_ref(),

View File

@@ -1,4 +1,4 @@
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
pub use crate::core_crypto::fft_impl::fft128::math::fft::Fft128View;
use dyn_stack::PodStack;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -819,7 +819,7 @@ pub fn convert_forward_integer_avx2(
let in_im_hi = pulp::as_arrays::<4, _>(in_im_hi).0;
for (out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
{
let out_re =
to_signed_to_f128_avx2(simd, (pulp::cast(*in_re_lo), pulp::cast(*in_re_hi)));
@@ -899,7 +899,7 @@ pub fn convert_forward_integer_avx512(
let in_im_hi = pulp::as_arrays::<8, _>(in_im_hi).0;
for (out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
{
let out_re =
to_signed_to_f128_avx512(simd, (pulp::cast(*in_re_lo), pulp::cast(*in_re_hi)));
@@ -938,7 +938,7 @@ pub fn convert_forward_integer_scalar(
in_im_hi: &[u64],
) {
for (out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
{
let out_re = to_signed_to_f128((*in_re_lo, *in_re_hi));
let out_im = to_signed_to_f128((*in_im_lo, *in_im_hi));
@@ -990,7 +990,7 @@ pub fn convert_add_backward_torus_scalar(
) {
let norm = 1.0 / in_re0.len() as f64;
for (out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
{
let in_re = f128(*in_re0 * norm, *in_re1 * norm);
let in_im = f128(*in_im0 * norm, *in_im1 * norm);
@@ -1056,7 +1056,7 @@ pub fn convert_add_backward_torus_avx2(
let in_im1 = pulp::as_arrays::<4, _>(in_im1).0;
for (out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
{
let in_re = (
simd.mul_f64x4(pulp::cast(*in_re0), norm),
@@ -1149,7 +1149,7 @@ pub fn convert_add_backward_torus_avx512(
let in_im1 = pulp::as_arrays::<8, _>(in_im1).0;
for (out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
{
let in_re = (
simd.mul_f64x8(pulp::cast(*in_re0), norm),

View File

@@ -14,7 +14,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, IntoContainerOwned, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
use crate::core_crypto::fft_impl::fft64::math::fft::par_convert_polynomials_list_to_fourier;
@@ -72,7 +72,10 @@ impl<C: Container<Element = c64>> FourierLweBootstrapKey<C> {
}
/// Return an iterator over the GGSW ciphertexts composing the key.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
+ ExactSizeIterator<Item = FourierGgswCiphertext<C>>
where
C: Split,
{
@@ -190,6 +193,9 @@ pub fn fill_with_forward_fourier_scratch(fft: FftView<'_>) -> Result<StackReq, S
impl FourierLweBootstrapKeyMutView<'_> {
/// Fill a bootstrapping key with the Fourier transform of a bootstrapping key in the standard
/// domain.
///
/// # Panics
/// This will panic if self and coeff_bsk are not of the same size
pub fn fill_with_forward_fourier<Scalar: UnsignedTorus>(
mut self,
coef_bsk: LweBootstrapKey<&'_ [Scalar]>,
@@ -197,7 +203,7 @@ impl FourierLweBootstrapKeyMutView<'_> {
stack: &mut PodStack,
) {
for (fourier_ggsw, standard_ggsw) in
izip!(self.as_mut_view().into_ggsw_iter(), coef_bsk.iter())
izip_eq!(self.as_mut_view().into_ggsw_iter(), coef_bsk.iter())
{
fourier_ggsw.fill_with_forward_fourier(standard_ggsw, fft, stack);
}
@@ -325,7 +331,8 @@ impl FourierLweBootstrapKeyView<'_> {
let mut ct1 =
GlweCiphertextMutView::from_container(&mut *ct1, lut_poly_size, ciphertext_modulus);
for (lwe_mask_element, bootstrap_key_ggsw) in izip!(msed_lwe_mask, self.into_ggsw_iter()) {
for (lwe_mask_element, bootstrap_key_ggsw) in izip_eq!(msed_lwe_mask, self.into_ggsw_iter())
{
if lwe_mask_element != 0 {
let monomial_degree = MonomialDegree(lwe_mask_element);
@@ -335,7 +342,7 @@ impl FourierLweBootstrapKeyView<'_> {
// We rotate ct_1 and subtract ct_0 (first step of cmux) by performing
// ct_1 <- (ct_0 * X^{a_hat}) - ct_0
for (mut ct1_poly, ct0_poly) in izip!(
for (mut ct1_poly, ct0_poly) in izip_eq!(
ct1.as_mut_polynomial_list().iter_mut(),
ct0.as_polynomial_list().iter(),
) {
@@ -386,7 +393,7 @@ impl FourierLweBootstrapKeyView<'_> {
let ciphertext_modulus = lut_list.ciphertext_modulus();
assert!(ciphertext_modulus.is_compatible_with_native_modulus());
for (mut lut, lwe) in izip!(lut_list.as_mut_view().iter_mut(), msed_lwe_list.iter()) {
for (mut lut, lwe) in izip_eq!(lut_list.as_mut_view().iter_mut(), msed_lwe_list.iter()) {
let msed_lwe_body = lwe.body();
let monomial_degree = MonomialDegree(msed_lwe_body.cast_into());
@@ -414,7 +421,7 @@ impl FourierLweBootstrapKeyView<'_> {
);
for (idx, bootstrap_key_ggsw) in self.into_ggsw_iter().enumerate() {
for (mut ct0, mut ct1, msed_lwe) in izip!(
for (mut ct0, mut ct1, msed_lwe) in izip_eq!(
ct0_list.as_mut_view().iter_mut(),
ct1_list.as_mut_view().iter_mut(),
msed_lwe_list.iter()
@@ -431,7 +438,7 @@ impl FourierLweBootstrapKeyView<'_> {
// We rotate ct_1 and subtract ct_0 (first step of cmux) by performing
// ct_1 <- (ct_0 * X^{a_hat}) - ct_0
for (mut ct1_poly, ct0_poly) in izip!(
for (mut ct1_poly, ct0_poly) in izip_eq!(
ct1.as_mut_polynomial_list().iter_mut(),
ct0.as_polynomial_list().iter(),
) {
@@ -551,7 +558,8 @@ impl FourierLweBootstrapKeyView<'_> {
self.batch_blind_rotate_assign(local_accumulator.as_mut_view(), &lwe_in_msed, fft, stack);
for (mut lwe_out, local_accumulator) in izip!(lwe_out.iter_mut(), local_accumulator.iter())
for (mut lwe_out, local_accumulator) in
izip_eq!(lwe_out.iter_mut(), local_accumulator.iter())
{
extract_lwe_sample_from_glwe_ciphertext(
&local_accumulator,

View File

@@ -10,7 +10,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, IntoContainerOwned, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::ggsw_ciphertext::{
fourier_ggsw_level_matrix_size, GgswCiphertextView,
};
@@ -154,7 +154,10 @@ impl<C: Container<Element = c64>> FourierGgswLevelMatrix<C> {
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = FourierGgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = FourierGgswLevelRow<C>>
+ ExactSizeIterator<Item = FourierGgswLevelRow<C>>
where
C: Split,
{
@@ -262,7 +265,7 @@ impl FourierGgswCiphertextMutView<'_> {
debug_assert_eq!(coef_ggsw.polynomial_size(), self.polynomial_size());
let fourier_poly_size = coef_ggsw.polynomial_size().to_fourier_polynomial_size().0;
for (fourier_poly, coef_poly) in izip!(
for (fourier_poly, coef_poly) in izip_eq!(
self.data().into_chunks(fourier_poly_size),
coef_ggsw.as_polynomial_list().iter()
) {
@@ -401,7 +404,10 @@ impl<C: Container<Element = c64>> FourierGgswCiphertextList<C> {
}
}
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl ExactSizeIterator<Item = FourierGgswCiphertext<C>>
+ DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
where
C: Split,
{
@@ -548,7 +554,7 @@ pub fn add_external_product_assign<Scalar>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
)
@@ -586,7 +592,7 @@ pub fn add_external_product_assign<Scalar>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
- izip!(
+ izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(fourier_poly_size)
@@ -649,26 +655,26 @@ pub(crate) fn update_with_fmadd(
let rhs = S::as_simd_c64s(fourier).0;
if is_output_uninit {
- for (output_fourier, ggsw_poly) in izip!(
+ for (output_fourier, ggsw_poly) in izip_eq!(
output_fft_buffer.into_chunks(fourier_poly_size),
lhs_polynomial_list.into_chunks(fourier_poly_size)
) {
let out = S::as_mut_simd_c64s(output_fourier).0;
let lhs = S::as_simd_c64s(ggsw_poly).0;
- for (out, lhs, rhs) in izip!(out, lhs, rhs) {
+ for (out, lhs, rhs) in izip_eq!(out, lhs, rhs) {
*out = simd.mul_c64s(*lhs, *rhs);
}
}
} else {
- for (output_fourier, ggsw_poly) in izip!(
+ for (output_fourier, ggsw_poly) in izip_eq!(
output_fft_buffer.into_chunks(fourier_poly_size),
lhs_polynomial_list.into_chunks(fourier_poly_size)
) {
let out = S::as_mut_simd_c64s(output_fourier).0;
let lhs = S::as_simd_c64s(ggsw_poly).0;
- for (out, lhs, rhs) in izip!(out, lhs, rhs) {
+ for (out, lhs, rhs) in izip_eq!(out, lhs, rhs) {
*out = simd.mul_add_c64s(*lhs, *rhs, *out);
}
}
@@ -719,7 +725,7 @@ pub(crate) fn update_with_fmadd_factor(
fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
let factor = simd.splat_c64s(self.factor);
- for (output_fourier, ggsw_poly) in izip!(
+ for (output_fourier, ggsw_poly) in izip_eq!(
self.output_fft_buffer.into_chunks(self.fourier_poly_size),
self.lhs_polynomial_list.into_chunks(self.fourier_poly_size)
) {
@@ -728,12 +734,12 @@ pub(crate) fn update_with_fmadd_factor(
let rhs = S::as_simd_c64s(self.fourier).0;
if self.is_output_uninit {
- for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+ for (out, &lhs, &rhs) in izip_eq!(out, lhs, rhs) {
// NOTE: factor * (lhs * rhs) is more efficient than (lhs * rhs) * factor
*out = simd.mul_c64s(factor, simd.mul_c64s(lhs, rhs));
}
} else {
- for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+ for (out, &lhs, &rhs) in izip_eq!(out, lhs, rhs) {
// NOTE: see above
*out = simd.mul_add_c64s(factor, simd.mul_c64s(lhs, rhs), *out);
}
@@ -769,7 +775,7 @@ pub fn cmux<Scalar: UnsignedTorus>(
fft: FftView<'_>,
stack: &mut PodStack,
) {
- izip!(ct1.as_mut(), ct0.as_ref()).for_each(|(c1, c0)| {
+ izip_eq!(ct1.as_mut(), ct0.as_ref()).for_each(|(c1, c0)| {
*c1 = c1.wrapping_sub(*c0);
});
add_external_product_assign(ct0, ggsw, ct1.as_view(), fft, stack);


@@ -12,7 +12,7 @@ use crate::core_crypto::algorithms::*;
use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
use crate::core_crypto::commons::parameters::*;
use crate::core_crypto::commons::traits::*;
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use aligned_vec::CACHELINE_ALIGN;
use dyn_stack::{PodStack, SizeOverflow, StackReq};
@@ -216,7 +216,7 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
*out_pbs_body = (*out_pbs_body).wrapping_add(Scalar::ONE << (delta_log.0 + bit_idx - 1));
// Remove the extracted bit from the initial LWE to get a 0 at the extracted bit location.
- izip!(lwe_in_buffer.as_mut(), lwe_out_pbs_buffer.as_ref())
+ izip_eq!(lwe_in_buffer.as_mut(), lwe_out_pbs_buffer.as_ref())
.for_each(|(out, inp)| *out = (*out).wrapping_sub(*inp));
}
}
@@ -521,7 +521,7 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
break;
};
- let mut t_iter = izip!(t_0.iter_mut(), t_1.iter_mut()).enumerate();
+ let mut t_iter = izip_eq!(t_0.iter_mut(), t_1.iter_mut()).enumerate();
let (mut j_counter, (mut t0_j, mut t1_j)) = t_iter.next().unwrap();
@@ -539,7 +539,7 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
if t_fill[j] == 2 {
let (diff_data, stack) = stack.collect_aligned(
CACHELINE_ALIGN,
- izip!(t1_j.as_ref(), t0_j.as_ref()).map(|(&a, &b)| a.wrapping_sub(b)),
+ izip_eq!(t1_j.as_ref(), t0_j.as_ref()).map(|(&a, &b)| a.wrapping_sub(b)),
);
let diff = GlweCiphertext::from_container(
&*diff_data,
@@ -709,7 +709,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
pfpksk_list.ciphertext_modulus(),
);
- for (lwe_in, ggsw) in izip!(lwe_list_in.iter(), ggsw_list.as_mut_view().into_ggsw_iter()) {
+ for (lwe_in, ggsw) in izip_eq!(lwe_list_in.iter(), ggsw_list.as_mut_view().into_ggsw_iter()) {
circuit_bootstrap_boolean(
fourier_bsk,
lwe_in,
@@ -728,7 +728,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
let small_lut_size = big_lut_as_polynomial_list.polynomial_count().0 / number_of_luts;
- for (lut, lwe_out) in izip!(
+ for (lut, lwe_out) in izip_eq!(
big_lut_as_polynomial_list.chunks_exact(small_lut_size),
lwe_list_out.iter_mut(),
) {


@@ -521,7 +521,7 @@ pub fn test_cmux_tree() {
level,
);
for (&single_bit_msg, mut fourier_ggsw) in
- izip!(vec_message.iter(), ggsw_list.as_mut_view().into_ggsw_iter())
+ izip_eq!(vec_message.iter(), ggsw_list.as_mut_view().into_ggsw_iter())
{
let mut ggsw = GgswCiphertextOwned::new(
0_u64,


@@ -6,7 +6,7 @@ use crate::core_crypto::commons::math::torus::UnsignedTorus;
use crate::core_crypto::commons::numeric::CastInto;
use crate::core_crypto::commons::parameters::{PolynomialCount, PolynomialSize};
use crate::core_crypto::commons::traits::{Container, ContainerMut, IntoContainerOwned};
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use aligned_vec::{avec, ABox};
use dyn_stack::{PodStack, SizeOverflow, StackReq};
@@ -65,7 +65,7 @@ impl Twisties {
let mut im = avec![0.0; n].into_boxed_slice();
let unit = core::f64::consts::PI / (2.0 * n as f64);
- for (i, (re, im)) in izip!(&mut *re, &mut *im).enumerate() {
+ for (i, (re, im)) in izip_eq!(&mut *re, &mut *im).enumerate() {
(*im, *re) = (i as f64 * unit).sin_cos();
}
@@ -212,7 +212,7 @@ fn convert_forward_torus<Scalar: UnsignedTorus>(
) {
let normalization = 2.0_f64.powi(-(Scalar::BITS as i32));
- izip!(out, in_re, in_im, twisties.re, twisties.im).for_each(
+ izip_eq!(out, in_re, in_im, twisties.re, twisties.im).for_each(
|(out, in_re, in_im, w_re, w_im)| {
let in_re: f64 = in_re.into_signed().cast_into() * normalization;
let in_im: f64 = in_im.into_signed().cast_into() * normalization;
@@ -233,7 +233,7 @@ fn convert_forward_integer_scalar<Scalar: UnsignedTorus>(
in_im: &[Scalar],
twisties: TwistiesView<'_>,
) {
- izip!(out, in_re, in_im, twisties.re, twisties.im).for_each(
+ izip_eq!(out, in_re, in_im, twisties.re, twisties.im).for_each(
|(out, in_re, in_im, w_re, w_im)| {
let in_re: f64 = in_re.into_signed().cast_into();
let in_im: f64 = in_im.into_signed().cast_into();
@@ -278,7 +278,7 @@ fn convert_backward_torus<Scalar: UnsignedTorus>(
twisties: TwistiesView<'_>,
) {
let normalization = 1.0 / inp.len() as f64;
- izip!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
+ izip_eq!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
|(out_re, out_im, inp, w_re, w_im)| {
let tmp = inp
* (c64 {
@@ -299,7 +299,7 @@ fn convert_add_backward_torus_scalar<Scalar: UnsignedTorus>(
twisties: TwistiesView<'_>,
) {
let normalization = 1.0 / inp.len() as f64;
- izip!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
+ izip_eq!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
|(out_re, out_im, inp, w_re, w_im)| {
let tmp = inp
* (c64 {
@@ -781,7 +781,7 @@ pub fn par_convert_polynomials_list_to_fourier<Scalar: UnsignedTorus>(
let stack = PodStack::new(&mut mem);
- for (fourier_poly, standard_poly) in izip!(
+ for (fourier_poly, standard_poly) in izip_eq!(
fourier_poly_chunk.chunks_exact_mut(f_polynomial_size),
standard_poly_chunk.chunks_exact(polynomial_size.0)
) {


@@ -34,7 +34,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier.as_mut_view(), poly.as_view(), stack);
fft.backward_as_torus(roundtrip.as_mut_view(), fourier.as_view(), stack);
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS == 32 {
assert!(modular_distance(*expected, *actual) == Scalar::ZERO);
} else {
@@ -48,7 +48,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier.as_mut_view(), poly.as_view(), stack);
fft.add_backward_as_torus(roundtrip.as_mut_view(), fourier.as_view(), stack);
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS == 32 {
assert!(modular_distance(*expected, *actual) == Scalar::ZERO);
} else {
@@ -62,7 +62,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier.as_mut_view(), poly.as_view(), stack);
fft.add_backward_in_place_as_torus(roundtrip.as_mut_view(), fourier.as_mut_view(), stack);
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS == 32 {
assert!(modular_distance(*expected, *actual) == Scalar::ZERO);
} else {
@@ -119,7 +119,7 @@ fn test_product<Scalar: UnsignedTorus>() {
};
let integer_magnitude = 16;
- for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
+ for (x, y) in izip_eq!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
*x = generator.random_uniform();
*y = generator.random_uniform();
*y >>= Scalar::BITS - integer_magnitude;
@@ -135,7 +135,7 @@ fn test_product<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier0.as_mut_view(), poly0.as_view(), stack);
fft.forward_as_integer(fourier1.as_mut_view(), poly1.as_view(), stack);
- for (f0, f1) in izip!(&mut *fourier0.data, &*fourier1.data) {
+ for (f0, f1) in izip_eq!(&mut *fourier0.data, &*fourier1.data) {
*f0 *= *f1;
}
@@ -152,7 +152,7 @@ fn test_product<Scalar: UnsignedTorus>() {
stack,
);
- for (expected, actual) in izip!(
+ for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {
@@ -174,7 +174,7 @@ fn test_product<Scalar: UnsignedTorus>() {
stack,
);
- for (expected, actual) in izip!(
+ for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {
@@ -198,7 +198,7 @@ fn test_product<Scalar: UnsignedTorus>() {
stack,
);
- for (expected, actual) in izip!(
+ for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {


@@ -11,7 +11,7 @@
use super::super::super::c64;
use super::TwistiesView;
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
@@ -178,7 +178,7 @@ pub fn convert_forward_integer_u32_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -270,7 +270,7 @@ pub fn convert_forward_integer_u64_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -363,7 +363,7 @@ pub fn convert_forward_integer_u32_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -456,7 +456,7 @@ pub fn convert_forward_integer_u64_avx2_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -604,7 +604,7 @@ pub fn convert_add_backward_torus_u32_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m512d; 2]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);
@@ -690,7 +690,7 @@ pub fn convert_add_backward_torus_u64_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m512d; 2]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);
@@ -832,7 +832,7 @@ pub fn convert_add_backward_torus_u32_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m128d; 4]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);
@@ -917,7 +917,7 @@ pub fn convert_add_backward_torus_u64_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m128d; 4]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);


@@ -1,5 +1,5 @@
use crate::core_crypto::commons::test_tools::{modular_distance, new_random_generator};
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::gpu::vec::GpuIndex;
use crate::core_crypto::gpu::{
fourier_transform_backward_as_torus_f128_async, fourier_transform_forward_as_torus_f128_async,
@@ -53,7 +53,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
cuda_synchronize_device(0);
}
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS <= 64 {
assert_eq!(*expected, *actual);
} else {


@@ -1,5 +1,5 @@
use crate::core_crypto::gpu::vec::{range_bounds_to_start_end, CudaVec};
- use crate::core_crypto::gpu::{CudaLweList, CudaStreams};
+ use crate::core_crypto::gpu::{CudaLweList, CudaStreams, GpuIndex};
use crate::core_crypto::prelude::{
CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
LweDimension, LweSize, UnsignedInteger,
@@ -213,6 +213,10 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
Self(self.0.duplicate(streams))
}
+ pub(crate) fn gpu_indexes(&self) -> &[GpuIndex] {
+ self.0.d_vec.gpu_indexes.as_slice()
+ }
pub(crate) fn lwe_dimension(&self) -> LweDimension {
self.0.lwe_dimension
}


@@ -1212,6 +1212,11 @@ pub fn get_number_of_gpus() -> u32 {
unsafe { cuda_get_number_of_gpus() as u32 }
}
+ /// Get the number of sms on the GPU
+ pub fn get_number_of_sms() -> u32 {
+ unsafe { cuda_get_number_of_sms() as u32 }
+ }
/// Setup multi-GPU and return the number of GPUs used
pub fn setup_multi_gpu(device_0_id: GpuIndex) -> u32 {
unsafe { cuda_setup_multi_gpu(device_0_id.get()) as u32 }

Some files were not shown because too many files have changed in this diff.