feat(gpu): expose ILP in standard keyswitch

2026-04-28 03:01:21 -04:00 · 2026-04-15 13:19:33 +02:00
25 changed files with 117 additions and 415 deletions
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -38,7 +38,6 @@ on:
          - integer_aes
          - integer_aes256
          - hlapi_erc7984
-          - hlapi_erc7984_multi_group
          - hlapi_dex
          - hlapi_noise_squash
      op_flavor:
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -237,10 +237,8 @@ jobs:
          BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
          BENCH_COMMAND: ${{ matrix.command }}
          PRECISIONS_SET: ${{ inputs.precisions_set }}
-          __TFHE_RS_BENCH_MULTI_PROC_GROUPS: 2

      - name: Parse results
-        if: ${{ inputs.command != 'hlapi_erc7984_multi_group' }}
        run: |
          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
@@ -258,39 +256,6 @@ jobs:
          REF_NAME: ${{ github.ref_name }}
          BENCH_TYPE: ${{ matrix.bench_type }}

-      - name: Parse and merge erc7984_multi_group results
-        if: ${{ inputs.command == 'hlapi_erc7984_multi_group' }}
-        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p0/criterion "${RESULTS_FILENAME_P0}" \
-          --database tfhe_rs \
-          --hardware "${INPUTS_HARDWARE_NAME}" \
-          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch "${REF_NAME}" \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${BENCH_DATE}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --bench-type "${BENCH_TYPE}"
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p1/criterion "${RESULTS_FILENAME_P1}" \
-          --database tfhe_rs \
-          --hardware "${INPUTS_HARDWARE_NAME}" \
-          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch "${REF_NAME}" \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${BENCH_DATE}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --bench-type "${BENCH_TYPE}"
-          python3 ./ci/merge_multi_group_results.py --bench-type "${BENCH_TYPE}" --output "${RESULTS_FILENAME}" "${RESULTS_FILENAME_P0}" "${RESULTS_FILENAME_P1}"
-        env:
-          INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
-          REF_NAME: ${{ github.ref_name }}
-          BENCH_TYPE: ${{ matrix.bench_type }}
-          RESULTS_FILENAME_P0: parsed_benchmark_results_p0_${{ github.sha }}.json
-          RESULTS_FILENAME_P1: parsed_benchmark_results_p1_${{ github.sha }}.json
-
      - name: Parse additional benchmarks results files
        if: ${{ inputs.additional_file_to_parse }}
        run: |
--- a/.github/workflows/benchmark_summary.yml
+++ b/.github/workflows/benchmark_summary.yml
@@ -114,27 +114,6 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-gpu-erc7984-multi-group:
-    name: benchmark_documentation/run-benchmarks-gpu-erc7984-multi-group
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    if: inputs.run-gpu-benchmarks
-    needs: parse-gpu-inputs
-    with:
-      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
-      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
-      command: hlapi_erc7984_multi_group
-      bench_type: throughput
-      params_type: multi_bit
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
 # TODO add make recipe for HPU benchmarks
 #  run-benchmarks-hpu:
 #    name: benchmark_documentation/run-benchmarks-hpu
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,16 +62,15 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -66,15 +65,13 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -17,8 +17,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Weekly tests will be triggered every Monday at 8p.m.
-    - cron: "0 20 * * 1"
+    # Nightly tests will be triggered each evening 8p.m.
+    - cron: "0 20 * * *"
  pull_request:


@@ -28,41 +28,10 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
-  should-run:
-    name: gpu_integer_long_run_tests/should-run
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read  # Needed to check for file change
-    outputs:
-      is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
-        with:
-          files_yaml: |
-            gpu:
-              - tfhe/Cargo.toml
-              - tfhe/build.rs
-              - backends/tfhe-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
-              - tfhe/src/shortint/parameters/**
-              - '.github/workflows/gpu_integer_long_run_tests.yml'
-
  setup-instance:
    name: gpu_integer_long_run_tests/setup-instance
-    needs: [should-run]
-    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -131,10 +131,6 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          make semgrep_and_lint_gpu_code
-
      - name: Run fmt checks
        run: |
          make check_fmt_gpu
@@ -143,6 +139,10 @@ jobs:
        run: |
          make pcc_gpu

+      - name: Run semgrep and lint checks on CUDA code
+        run: |
+          make semgrep_and_lint_gpu_code
+
      - name: Run semver checks on tfhe-cuda-backend
        run: |
          make semver_check_cuda_backend
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -63,6 +63,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,17 +63,16 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -64,6 +64,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -63,6 +63,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled, opened, synchronize ]
+    types: [ labeled ]

 permissions:
  contents: read
@@ -38,7 +38,6 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
-      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,17 +63,16 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
-            core_crypto:
-              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -64,6 +64,7 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -55,9 +55,12 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/zk/**
              - tfhe-zk-pok/**
+              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

--- a/47
+++ b/47
@@ -312,10 +312,8 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
 	find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
 		| grep -v '/cmake-build-debug/' \
 		| grep -v '/build/' \
-		| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
+		| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
 	venv/bin/python3 "scripts/check_scratch_cleanup.py"
-	@# Split the search string using shell string concatenation so the Makefile line doesn't match itself
-	! git ls-files | xargs grep -n 'TODO: ADD COMM''ENT'

 .PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
 semver_check_cuda_backend:
@@ -1957,40 +1955,6 @@ bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
 	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_hlapi_erc7984_multi_group_gpu # Runs ERC7984 bench in two processes (half of gpus for each) and aggregates results
-bench_hlapi_erc7984_multi_group_gpu: install_rs_check_toolchain
-	# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
-	# we don't know which process starts first and which one deletes the files - file deletion may also be not atomic)
-	rm -f /dev/shm/sem.tfhe_bench_*
-	NUM_GROUPS=$${__TFHE_RS_BENCH_MULTI_PROC_GROUPS:-2}; \
-	[ "$$NUM_GROUPS" -ge 2 ] || { echo "Error: __TFHE_RS_BENCH_MULTI_PROC_GROUPS must be at least 2, got $$NUM_GROUPS"; exit 1; }; \
-	trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
-	for i in $$(seq 0 $$((NUM_GROUPS - 1))); do \
-		GPU_LIST=$$(python3 ci/split_gpus.py $$i $$NUM_GROUPS) || exit 1; \
-		echo "Starting benchmark group $$i with CUDA_VISIBLE_DEVICES=$$GPU_LIST"; \
-		CUDA_VISIBLE_DEVICES=$$GPU_LIST CARGO_TARGET_DIR=target_p$$i RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=$$NUM_GROUPS \
-		cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-		--bench hlapi-erc7984 \
-		--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
-	done; \
-	wait
-
-.PHONY: bench_hlapi_erc7984_multi_group_fake_multi_gpu # Runs ERC7984 bench in two processes in parallel on a single GPU (use to debug bench_hlapi_erc7984_multi_group_gpu)
-bench_hlapi_erc7984_multi_group_fake_multi_gpu: install_rs_check_toolchain
-	# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
-	# we don't know which process starts first and which one deletes the files - file deletion may also be not atomic)
-	rm -f /dev/shm/sem.tfhe_bench_*
-	trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
-	CARGO_TARGET_DIR=target_p0 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
-	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
-	CARGO_TARGET_DIR=target_p1 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc7984 \
-	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
-	wait
-
 .PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
 bench_hlapi_dex: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -2119,16 +2083,11 @@ bench_summary_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'

-	# This make target only runs the latency benchmark. This is because
-	# summary benchmarks must use the multi-process-multi-group throughput target
-	# to measure throughput. That target must be followed by specific post-processing steps.
-	# Thus that target is run in a separate step in benchmark_summary.yml.
-ifneq ($(filter latency both,$(BENCH_TYPE)),)
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=latency __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	# ERC7984
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc7984 \
 	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
-endif

 	# DEX
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE)  __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -301,7 +301,11 @@ __global__ void keyswitch_zero_output_with_output_indices(
 // in two parts, a constant part is calculated before the loop, and a variable
 // part is calculated inside the loop. This seems to help with the register
 // pressure as well.
-template <typename Torus, typename KSTorus>
+// LevelCount template fully unrolls the level loop.
+// Accumulation step is divided in even and odd to expose more instruction level
+// parallelism. When LevelCount == 0 (default), the original runtime path is
+// used unchanged.
+template <typename Torus, typename KSTorus, int LevelCount = 0>
 __global__ void
 keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
          const Torus *__restrict__ lwe_array_in,
@@ -318,13 +322,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,

  if (tid <= lwe_dimension_out) {

-    KSTorus local_lwe_out = 0;
+    // Accumulator is splited into even and odd iterations.
+    KSTorus local_lwe_out_even = 0;
+    KSTorus local_lwe_out_odd = 0;
    auto block_lwe_array_in = get_chunk(
        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);

    if (tid == lwe_dimension_out && threadIdx.y == 0) {
      if constexpr (std::is_same_v<KSTorus, Torus>) {
-        local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
+        local_lwe_out_even = -block_lwe_array_in[lwe_dimension_in];
      } else {
        auto new_body = closest_repr(block_lwe_array_in[lwe_dimension_in],
                                     sizeof(KSTorus) * 8, 1);
@@ -337,7 +343,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
        auto rounded_downscaled_body =
            (KSTorus)(new_body >> input_to_output_scaling_factor);

-        local_lwe_out = -rounded_downscaled_body;
+        local_lwe_out_even = -rounded_downscaled_body;
      }
    }
    const Torus mask_mod_b = (1ll << base_log) - 1ll;
@@ -351,18 +357,41 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    for (int i = start_i; i < end_i; i++) {
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
-      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
-#pragma unroll 1
-      for (int j = 0; j < level_count; j++) {

-        KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-        local_lwe_out +=
-            (KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
-            decomposed;
+      // Fully unrolled level loop in case LevelCount is precompiled.
+      if constexpr (LevelCount > 0) {
+        uint32_t offset = i * LevelCount * (lwe_dimension_out + 1);
+#pragma unroll
+        for (int j = 0; j < LevelCount; j++) {
+          KSTorus decomposed =
+              decompose_one<Torus>(state, mask_mod_b, base_log);
+          if (j & 1)
+            local_lwe_out_odd +=
+                (KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
+                decomposed;
+          else
+            local_lwe_out_even +=
+                (KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
+                decomposed;
+        }
+      } else {
+        // Runtime fallback: original behaviour preserved exactly.
+        uint32_t offset = i * level_count * (lwe_dimension_out + 1);
+#pragma unroll 1
+        for (int j = 0; j < level_count; j++) {
+          KSTorus decomposed =
+              decompose_one<Torus>(state, mask_mod_b, base_log);
+          local_lwe_out_even +=
+              (KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
+              decomposed;
+        }
      }
    }

-    lwe_acc_out[shmem_index] = local_lwe_out;
+    if constexpr (LevelCount > 0)
+      lwe_acc_out[shmem_index] = local_lwe_out_even + local_lwe_out_odd;
+    else
+      lwe_acc_out[shmem_index] = local_lwe_out_even;
  }

  for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
@@ -405,9 +434,46 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
  dim3 grid(num_samples, num_blocks_per_sample, 1);
  dim3 threads(num_threads_x, num_threads_y, 1);

-  keyswitch<Torus, KSTorus><<<grid, threads, shared_mem, stream>>>(
-      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count);
+  // Dispatch to a statically-specialised kernel for common level_count values
+  // so the level loop is fully unrolled.
+#define KS_LAUNCH(N)                                                           \
+  keyswitch<Torus, KSTorus, N><<<grid, threads, shared_mem, stream>>>(         \
+      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk, \
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count)
+
+  switch (level_count) {
+  case 1:
+    KS_LAUNCH(1);
+    break;
+  case 2:
+    KS_LAUNCH(2);
+    break;
+  case 3:
+    KS_LAUNCH(3);
+    break;
+  case 4:
+    KS_LAUNCH(4);
+    break;
+  case 5:
+    KS_LAUNCH(5);
+    break;
+  case 6:
+    KS_LAUNCH(6);
+    break;
+  case 7:
+    KS_LAUNCH(7);
+    break;
+  case 8:
+    KS_LAUNCH(8);
+    break;
+  case 9:
+    KS_LAUNCH(9);
+    break;
+  default:
+    KS_LAUNCH(0);
+    break;
+  }
+#undef KS_LAUNCH
  check_cuda_error(cudaGetLastError());
 }

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -489,7 +489,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_multi_bit(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cudaSetDevice(gpu_index);
  int multibit_size = size / grouping_factor;
  int num_threads = 0, num_blocks = 0;
  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -308,7 +308,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
                                                    pbs_buffer);
-  cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
 }

 // Noise tests variant of the 128-bit multi-bit PBS, restricted to
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
@@ -264,7 +264,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
  scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
      stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
      lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
-      input_lwe_ciphertext_count, true, false);
+      input_lwe_ciphertext_count, true, PBS_MS_REDUCTION_T::NO_REDUCTION);
  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;
  for (auto _ : st) {
--- a/ci/ec2_products_cost.json
+++ b/ci/ec2_products_cost.json
@@ -6,7 +6,6 @@
  "p4d.24xlarge": 32.7726,
  "p5.48xlarge": 98.32,
  "rtx4090": 0.04,
-  "n3-L40x4": 3.2,
  "n3-H100x1": 1.52,
  "n3-H100x8-NVLink": 12.48,
  "n3-H100x8": 12.16,
--- a/ci/merge_multi_group_results.py
+++ b/ci/merge_multi_group_results.py
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-# This script aggregates multi-process-group benchmark results
-# that are obtained by running benchmarks in a multi-process approach
-import argparse
-import json
-import sys
-
-ACCEPTED_TEST_PREFIXES = {
-    "throughput": ["hlapi::cuda::erc7984::throughput"],
-    "latency": ["hlapi::cuda::erc7984::latency"],
-}
-
-
-# Looks at the Slab JSON benchmark results and aggregates the "value" field.
-# For throughput, values are summed across groups.
-# For latency, values are averaged across groups.
-def merge_multi_group_results(input_files, output_file, bench_type):
-    accumulated = {}
-    counts = {}
-    metadata = None
-    accepted_prefixes = ACCEPTED_TEST_PREFIXES[bench_type]
-
-    for path in input_files:
-        with open(path) as f:
-            data = json.load(f)
-        if metadata is None:
-            metadata = {k: v for k, v in data.items() if k != "points"}
-        for point in data["points"]:
-            test = point["test"]
-            if not any(test.startswith(prefix) for prefix in accepted_prefixes):
-                print(
-                    f"Error: unexpected test '{test}' in {path}: "
-                    f"this script only supports aggregation of: {accepted_prefixes}",
-                    file=sys.stderr,
-                )
-                sys.exit(1)
-            if test in accumulated:
-                accumulated[test]["value"] += point["value"]
-                counts[test] += 1
-            else:
-                accumulated[test] = dict(point)
-                counts[test] = 1
-
-    if bench_type == "latency":
-        for test in accumulated:
-            accumulated[test]["value"] /= counts[test]
-
-    result = dict(metadata)
-    result["points"] = list(accumulated.values())
-
-    with open(output_file, "w") as f:
-        json.dump(result, f, indent=2)
-
-
-# The output is a positional argument, for file names we accept 2+
-parser = argparse.ArgumentParser()
-parser.add_argument("input_files", nargs="+")
-parser.add_argument("--output", required=True)
-parser.add_argument("--bench-type", required=True, choices=["throughput", "latency"])
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    if len(args.input_files) < 2:
-        print("Error: at least 2 input files required", file=sys.stderr)
-        sys.exit(1)
-    merge_multi_group_results(args.input_files, args.output, args.bench_type)
--- a/ci/split_gpus.py
+++ b/ci/split_gpus.py
@@ -1,55 +0,0 @@
-import argparse
-import subprocess
-import sys
-
-
-# List the gpus for a sub-group (group_index) of gpus grouped
-# in num_groups groups. The output string is passed to CUDA_VISIBLE_DEVICES
-def get_gpu_count() -> int:
-    try:
-        result = subprocess.run(
-            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-    except FileNotFoundError:
-        print("Error: nvidia-smi not found", file=sys.stderr)
-        sys.exit(1)
-    except subprocess.CalledProcessError as err:
-        print(f"Error: nvidia-smi failed: {err.stderr.strip()}", file=sys.stderr)
-        sys.exit(1)
-    return len(result.stdout.strip().splitlines())
-
-
-def gpu_list_for_group(num_gpus: int, group_index: int, num_groups: int) -> str:
-    # Splits the available gpus un groups and returns
-    # the gpus assigned to group group_index.
-    if num_gpus < num_groups:
-        print(
-            f"Error: cannot split {num_gpus} GPU(s) across {num_groups} group(s): "
-            "not enough GPUs",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    if num_gpus % num_groups != 0:
-        print(
-            f"Error: {num_gpus} GPU(s) is not evenly divisible by {num_groups} group(s)",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    gpus_per_group = num_gpus // num_groups
-    start = group_index * gpus_per_group
-    return ",".join(str(i) for i in range(start, start + gpus_per_group))
-
-
-parser = argparse.ArgumentParser(
-    description="Print the CUDA_VISIBLE_DEVICES value for one process in a multi-GPU split."
-)
-parser.add_argument("group_index", type=int, help="0-based index of this process group")
-parser.add_argument("num_groups", type=int, help="Total number of process groups")
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    num_gpus = get_gpu_count()
-    print(gpu_list_for_group(num_gpus, args.group_index, args.num_groups))
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -14,9 +14,6 @@ publish = false
 name = "benchmark"
 path = "src/lib.rs"

-[target.'cfg(unix)'.dependencies]
-libc = "0.2"
-
 [dependencies]
 bincode = { workspace = true }
 # clap has to be pinned as its minimum supported rust version
--- a/tfhe-benchmark/benches/high_level_api/erc7984.rs
+++ b/tfhe-benchmark/benches/high_level_api/erc7984.rs
@@ -1,7 +1,5 @@
 #[cfg(feature = "gpu")]
-use benchmark::utilities::{
-    bench_sync_barrier, configure_gpu, get_bench_gpu_instances, get_param_type, ParamType,
-};
+use benchmark::utilities::{configure_gpu, get_param_type, ParamType};
 use benchmark::utilities::{write_to_json_unchecked, OperatorType};
 use benchmark_spec::{get_bench_type, BenchmarkType};
 use criterion::measurement::WallTime;
@@ -562,11 +560,6 @@ fn cuda_bench_transfer_throughput<FheType, F>(
        let num_streams_per_gpu = 6; // Hard coded stream value for FheUint64
        let chunk_size = (num_elems / num_gpus) as usize;

-        #[cfg(target_os = "linux")]
-        if let Some(n) = get_bench_gpu_instances() {
-            bench_sync_barrier(n);
-        }
-
        b.iter(|| {
            from_amounts
                .par_chunks(chunk_size) // Split into chunks of num_gpus
--- a/tfhe-benchmark/src/utilities.rs
+++ b/tfhe-benchmark/src/utilities.rs
@@ -472,106 +472,6 @@ pub fn get_param_type() -> &'static ParamType {
    PARAM_TYPE.get_or_init(|| ParamType::from_env().unwrap())
 }

-pub fn get_bench_gpu_instances() -> Option<usize> {
-    env::var("__TFHE_RS_BENCH_GPU_PROCESS_COUNT").ok().map(|v| {
-        v.parse::<usize>().unwrap_or_else(|_| {
-            panic!("__TFHE_RS_BENCH_GPU_PROCESS_COUNT must be a positive integer, got '{v}'")
-        })
-    })
-}
-
-/// Multi-process barrier that ensures num_instances processes
-/// start at the same time
-#[cfg(target_os = "linux")]
-pub fn bench_sync_barrier(num_instances: usize) {
-    use std::ffi::CString;
-    use std::time::{Duration, SystemTime, UNIX_EPOCH};
-
-    const BARRIER_TIMEOUT_SECS: u64 = 120;
-    const MUTEX_NAME_PREFIX: &str = "tfhe_bench";
-
-    // Three POSIX semaphores are used for synchronization
-    // The first one is used to make sure the processes increment the
-    // counter and get the value of the counter atomically .
-    let sem_mutex = CString::new(format!("/{MUTEX_NAME_PREFIX}_mutex")).unwrap();
-    let sem_arrive = CString::new(format!("/{MUTEX_NAME_PREFIX}_arrive")).unwrap();
-    let sem_gate = CString::new(format!("/{MUTEX_NAME_PREFIX}_gate")).unwrap();
-
-    let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
-    let deadline_t = now + Duration::from_secs(BARRIER_TIMEOUT_SECS);
-    let deadline = libc::timespec {
-        tv_sec: deadline_t.as_secs() as libc::time_t,
-        tv_nsec: deadline_t.subsec_nanos() as libc::c_long,
-    };
-
-    let open_sem = |name: &CString, init: u32| {
-        let sem = unsafe { libc::sem_open(name.as_ptr(), libc::O_CREAT, 0o600u32, init) };
-        assert!(
-            sem != libc::SEM_FAILED,
-            "sem_open({:?}) failed: {}",
-            name,
-            std::io::Error::last_os_error()
-        );
-        sem
-    };
-
-    let timed_wait = |sem: *mut libc::sem_t, label: &str| {
-        let ret = unsafe { libc::sem_timedwait(sem, &deadline) };
-        if ret != 0 {
-            panic!(
-                "bench_sync_barrier: timed out on '{label}' after {BARRIER_TIMEOUT_SECS}s \
-                 (__TFHE_RS_BENCH_GPU_PROCESS_COUNT={num_instances}). \
-                 If semaphores are stale from a prior crash, clean up with: \
-                 rm -f /dev/shm/sem.{MUTEX_NAME_PREFIX}_*\n\
-                 OS error: {}",
-                std::io::Error::last_os_error()
-            );
-        }
-    };
-
-    let mutex = open_sem(&sem_mutex, 1);
-    let arrive = open_sem(&sem_arrive, 0);
-    let gate = open_sem(&sem_gate, 0);
-
-    // Process 0 to arrive doesn't need to wait
-    // Processes 1..N to arrive need to wait
-    timed_wait(mutex, "mutex");
-    // Each process posts to the arrive semaphore, incrementing its value
-    unsafe { libc::sem_post(arrive) };
-    // The last process to post to "arrive" will read a value equal to "num_instances"
-    // The other processes read a lower value. "mutex" ensures
-    // the post + get_value are atomic
-    let mut count = 0i32;
-    unsafe { libc::sem_getvalue(arrive, &mut count) };
-
-    // Once a process has posted to arrive and got the value (atomic)
-    // it allows the other processes to do the same
-    unsafe { libc::sem_post(mutex) };
-
-    // The last process reads the "num_instances" value from arrive.
-    // it must then tell the others to continue work. if it doesn't
-    // the other processes will time out at the "gate"
-    if count as usize == num_instances {
-        for _ in 0..num_instances {
-            // Open the gate
-            unsafe { libc::sem_post(gate) };
-        }
-    }
-
-    // Every process waits at the gate. If it doesn't open in a certain time, then we panic
-    timed_wait(gate, "gate");
-
-    // Clean up
-    unsafe {
-        libc::sem_close(mutex);
-        libc::sem_close(arrive);
-        libc::sem_close(gate);
-        libc::sem_unlink(sem_mutex.as_ptr());
-        libc::sem_unlink(sem_arrive.as_ptr());
-        libc::sem_unlink(sem_gate.as_ptr());
-    }
-}
-
 /// Generate a number of threads to use to saturate current machine for throughput measurements.
 pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
    let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set