Fix Makefile

Add cost for 8xL40
Add decomp_sns_comp bench to summary
2026-04-28 03:01:21 -04:00 · 2026-02-13 09:48:40 +01:00 · 2026-02-13 09:34:23 +01:00 · 2026-02-13 09:34:23 +01:00 · 2026-02-13 09:34:23 +01:00 · 2026-02-13 09:34:22 +01:00
7 changed files with 412 additions and 55 deletions
--- a/.github/workflows/benchmark_summary.yml
+++ b/.github/workflows/benchmark_summary.yml
@@ -0,0 +1,128 @@
+# Run all benchmarks displayed in the internal documentation.
+name: benchmark_summary
+
+run-name: Benchmark Summary
+
+on:
+  workflow_dispatch:
+    inputs:
+      run-cpu-benchmarks:
+        description: "Run CPU benchmarks"
+        type: boolean
+        default: true
+      run-gpu-benchmarks:
+        description: "Run GPU benchmarks"
+        type: boolean
+        default: true
+      gpu-profile:
+        description: "GPU Instance type"
+        required: true
+        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
+        type: choice
+        # Each option is "<profile> (<hardware name>)"; the parse-gpu-inputs job splits the two parts with sed.
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "8-l40 (n3-L40x8)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
+      run-hpu-benchmarks:
+        description: "Run HPU benchmarks"
+        type: boolean
+        default: true
+
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  parse-gpu-inputs:
+    name: benchmark_summary/parse-gpu-inputs
+    if: inputs.run-gpu-benchmarks
+    runs-on: ubuntu-latest
+    outputs:
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
+    steps:
+      - name: Parse profile
+        id: parse_profile
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
+
+      - name: Parse hardware name
+        id: parse_hardware_name
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
+
+  run-benchmarks-cpu:
+    name: benchmark_summary/run-benchmarks-cpu
+    uses: ./.github/workflows/benchmark_cpu_common.yml
+    if: inputs.run-cpu-benchmarks
+    with:
+      command: summary
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-gpu:
+    name: benchmark_summary/run-benchmarks-gpu
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-benchmarks
+    needs: parse-gpu-inputs
+    with:
+      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
+      command: summary
+      bench_type: both
+      params_type: classical + multi_bit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+# TODO add make recipe for HPU benchmarks
+#  run-benchmarks-hpu:
+#    name: benchmark_documentation/run-benchmarks-hpu
+#    uses: ./.github/workflows/benchmark_hpu_common.yml
+#    if: inputs.run-hpu-benchmarks
+#    with:
+#      command: summary
+#      bench_type: both
+#      v80_pcie_dev: 24
+#      v80_serial_number: XFL12NWY3ZKG
+#    secrets:
+#      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+#      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+#      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+#      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+#      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+#      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+#      SLAB_URL: ${{ secrets.SLAB_URL }}
+#      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+#      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
--- a/.github/workflows/placeholder_workflow.yml
+++ b/.github/workflows/placeholder_workflow.yml
@@ -1,18 +1,127 @@
 # Placeholder workflow file allowing running it without having to merge to main first
 name: placeholder_workflow

+run-name: Summary benchs tests
+
 on:
  workflow_dispatch:
+    inputs:
+      run-cpu-benchmarks:
+        description: "Run CPU benchmarks"
+        type: boolean
+        default: true
+      run-gpu-benchmarks:
+        description: "Run GPU benchmarks"
+        type: boolean
+        default: true
+      gpu-profile:
+        description: "GPU Instance type"
+        required: true
+        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "8-l40 (n3-L40x8)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
+      run-hpu-benchmarks:
+        description: "Run HPU benchmarks"
+        type: boolean
+        default: true
+

 permissions: {}

 # zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow

 jobs:
-  placeholder:
-    name: placeholder_workflow/placeholder
+  parse-gpu-inputs:
+    name: benchmark_summary/parse-gpu-inputs
+    if: inputs.run-gpu-benchmarks
    runs-on: ubuntu-latest
-
+    outputs:
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
    steps:
-      - run: |
-          echo "Hello this is a Placeholder Workflow"
+      - name: Parse profile
+        id: parse_profile
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
+
+      - name: Parse hardware name
+        id: parse_hardware_name
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
+
+  run-benchmarks-cpu:
+    name: benchmark_documentation/run-benchmarks-cpu-integer
+    uses: ./.github/workflows/benchmark_cpu_common.yml
+    if: inputs.run-cpu-benchmarks
+    with:
+      command: summary
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-gpu:
+    name: benchmark_documentation/run-benchmarks-gpu
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-benchmarks
+    needs: parse-gpu-inputs
+    with:
+      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
+      command: summary
+      bench_type: both
+      params_type: classical + multi_bit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+# TODO add make recipe for HPU benchmarks
+#  run-benchmarks-hpu:
+#    name: benchmark_documentation/run-benchmarks-hpu
+#    uses: ./.github/workflows/benchmark_hpu_common.yml
+#    if: inputs.run-hpu-benchmarks
+#    with:
+#      command: summary
+#      bench_type: both
+#      v80_pcie_dev: 24
+#      v80_serial_number: XFL12NWY3ZKG
+#    secrets:
+#      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+#      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+#      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+#      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+#      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+#      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+#      SLAB_URL: ${{ secrets.SLAB_URL }}
+#      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+#      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
--- a/Makefile
+++ b/Makefile
@@ -1438,14 +1438,14 @@ bench_integer_hpu: install_rs_check_toolchain

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1459,6 +1459,6 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
@@ -1526,7 +1527,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1780,6 +1781,101 @@ bench_hlapi_kvstore: install_rs_check_toolchain
 	--bench hlapi-kvstore \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

+.PHONY: bench_summary # Run summary benchmarks
+bench_summary: install_rs_check_toolchain
+	# Arithmetic operations: addition, multiplication, division, comparison
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi_unsigned \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::add|::mul|::gt|::div_rem'
+
+	# Noise squash
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::noise_squash::'
+
+	# ERC20
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
+
+	# DEX
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::no_cmux::'
+
+	# ZK
+	# Proof is done on CPU node of the instance
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,zk-pok,pbs-stats \
+	-p tfhe-benchmark -- '::pke_zk_proof'
+	# Verify runs on CPU in this target (no gpu feature); no name filter is passed, so every integer-zk-pke bench runs
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
+
+	# Compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-glwe_packing_compression \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
+
+.PHONY: bench_summary_gpu # Run summary benchmarks on GPU
+bench_summary_gpu: install_rs_check_toolchain
+	# Arithmetic operations: addition, multiplication, division, comparison
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::add|::mul|::gt|::div_rem'
+
+	# Noise squash
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::noise_squash::'
+
+	# Noise squash and compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
+
+	# ERC20
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
+
+	# DEX
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE)  __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::no_cmux::'
+
+	# ZK
+	# Proof is done on CPU node of the instance
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,zk-pok,pbs-stats \
+	-p tfhe-benchmark -- '::pke_zk_proof'
+	# Verify is done on GPUs
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --
+
+	# Compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-glwe_packing_compression \
+	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_custom # Run benchmarks with a user-defined command
 bench_custom: install_rs_check_toolchain
--- a/ci/ec2_products_cost.json
+++ b/ci/ec2_products_cost.json
@@ -12,6 +12,7 @@
  "n3-H100x4": 6.08,
  "n3-H100x2": 3.04,
  "n3-L40x1": 0.80,
+  "n3-L40x8": 6.40,
  "n3-H100-SXM5x8": 15.36,
  "hpu_x1": 1.0,
  "hpu_x2": 1.4,
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -106,3 +106,10 @@ environment_name = "norway"
 image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
 flavor_name = "n3-RTX-A4000x4"
 user = "ubuntu"
+
+[backend.hyperstack.8-l40]
+environment_name = "canada"
+image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
+flavor_name = "n3-L40x8"
+user = "ubuntu"
+
--- a/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
@@ -1,6 +1,7 @@
 use benchmark::params_aliases::*;
 use benchmark::utilities::{
-    get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
+    get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, BitSizesSet, EnvConfig,
+    OperatorType,
 };
 use criterion::{black_box, criterion_group, Criterion, Throughput};
 use rayon::prelude::*;
@@ -8,8 +9,35 @@ use std::cmp::max;
 use tfhe::integer::ciphertext::CompressedCiphertextListBuilder;
 use tfhe::integer::{ClientKey, RadixCiphertext};
 use tfhe::keycache::NamedParam;
+use tfhe::shortint::parameters::LweCiphertextCount;
+use tfhe::shortint::MessageModulus;
 use tfhe::{get_pbs_count, reset_pbs_count};

+fn default_config(
+    lwe_per_glwe: &LweCiphertextCount,
+    message_modulus: &MessageModulus,
+) -> Vec<usize> {
+    let env_config = EnvConfig::new();
+
+    match env_config.bit_sizes_set {
+        BitSizesSet::Fast => {
+            vec![64]
+        }
+        _ => {
+            vec![
+                2,
+                8,
+                16,
+                32,
+                64,
+                128,
+                256,
+                lwe_per_glwe.0 * message_modulus.0.ilog2() as usize,
+            ]
+        }
+    }
+}
+
 fn cpu_glwe_packing(c: &mut Criterion) {
    let bench_name = "integer::packing_compression";
    let mut bench_group = c.benchmark_group(bench_name);
@@ -29,16 +57,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {

    let log_message_modulus = param.message_modulus.0.ilog2() as usize;

-    for bit_size in [
-        2,
-        8,
-        16,
-        32,
-        64,
-        128,
-        256,
-        comp_param.lwe_per_glwe().0 * log_message_modulus,
-    ] {
+    for bit_size in default_config(&comp_param.lwe_per_glwe(), &param.message_modulus) {
        assert_eq!(bit_size % log_message_modulus, 0);
        let num_blocks = bit_size / log_message_modulus;

@@ -159,17 +178,17 @@ fn cpu_glwe_packing(c: &mut Criterion) {
 mod cuda {
    use super::*;
    use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
+    use benchmark::utilities::{get_param_type, ParamType};
    use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
    use tfhe::integer::compression_keys::CompressionPrivateKeys;
    use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
    use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
    use tfhe::integer::gpu::gen_keys_radix_gpu;
    use tfhe::shortint::parameters::CompressionParameters;
-    use tfhe::shortint::PBSParameters;

    #[derive(Clone)]
    struct BenchConfig {
-        param: PBSParameters,
+        param: tfhe::shortint::AtomicPatternParameters,
        comp_param: CompressionParameters,
        bit_size: usize,
        cks: ClientKey,
@@ -289,7 +308,7 @@ mod cuda {

        write_to_json::<u64, _>(
            &bench_id_pack,
-            (comp_param, param.into()),
+            (comp_param, param),
            comp_param.name(),
            "pack",
            &OperatorType::Atomic,
@@ -452,7 +471,7 @@ mod cuda {

        write_to_json::<u64, _>(
            &bench_id_unpack,
-            (comp_param, param.into()),
+            (comp_param, param),
            comp_param.name(),
            "unpack",
            &OperatorType::Atomic,
@@ -464,64 +483,62 @@ mod cuda {
    }

    fn gpu_glwe_packing(c: &mut Criterion) {
-        let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-        let comp_param =
-            BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-
-        let log_message_modulus = param.message_modulus.0.ilog2() as usize;
+        let (param, comp_param): (
+            tfhe::shortint::AtomicPatternParameters,
+            CompressionParameters,
+        ) = match get_param_type() {
+            ParamType::Classical => (
+                BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            ),
+            _ => (
+                BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            ),
+        };

        let cks = ClientKey::new(param);
        let private_compression_key = cks.new_compression_private_key(comp_param);

        let mut config = BenchConfig {
-            param: tfhe::shortint::PBSParameters::MultiBitPBS(param),
+            param,
            comp_param,
            cks,
            private_compression_key,
            bit_size: 0,
        };
-        for bit_size in [
-            2,
-            8,
-            16,
-            32,
-            64,
-            128,
-            256,
-            comp_param.lwe_per_glwe().0 * log_message_modulus,
-        ] {
+        for bit_size in default_config(&comp_param.lwe_per_glwe(), &param.message_modulus()) {
            config.bit_size = bit_size;
            execute_gpu_glwe_packing(c, config.clone());
        }
    }

    fn gpu_glwe_unpacking(c: &mut Criterion) {
-        let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-        let comp_param =
-            BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-
-        let log_message_modulus = param.message_modulus.0.ilog2() as usize;
+        let (param, comp_param): (
+            tfhe::shortint::AtomicPatternParameters,
+            CompressionParameters,
+        ) = match get_param_type() {
+            ParamType::Classical => (
+                BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            ),
+            _ => (
+                BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            ),
+        };

        let cks = ClientKey::new(param);
        let private_compression_key = cks.new_compression_private_key(comp_param);

        let mut config = BenchConfig {
-            param: PBSParameters::MultiBitPBS(param),
+            param,
            comp_param,
            bit_size: 0,
            cks,
            private_compression_key,
        };
-        for bit_size in [
-            2,
-            8,
-            16,
-            32,
-            64,
-            128,
-            256,
-            comp_param.lwe_per_glwe().0 * log_message_modulus,
-        ] {
+        for bit_size in default_config(&comp_param.lwe_per_glwe(), &param.message_modulus()) {
            config.bit_size = bit_size;
            execute_gpu_glwe_unpacking(c, config.clone());
        }
--- a/tfhe/src/shortint/keycache.rs
+++ b/tfhe/src/shortint/keycache.rs
@@ -466,6 +466,7 @@ fn ks_params_default_name(params: &ShortintKeySwitchingParameters) -> String {

 named_params_impl!(CompressionParameters =>
    COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    V1_6_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    ; fallback => comp_params_default_name
 );
Author	SHA1	Message	Date
Agnes Leroy	b1e25c65a7	Fix Makefile	2026-02-13 09:48:40 +01:00
Agnes Leroy	cba4f2e6dd	Add cost for 8xL40	2026-02-13 09:34:23 +01:00
Agnes Leroy	6199610865	Add decomp_sns_comp bench to summary	2026-02-13 09:34:23 +01:00
Agnes Leroy	aae9e6c9a5	Add 8xL40 to slab config	2026-02-13 09:34:23 +01:00
Agnes Leroy	6ecf2a59e2	Fix Makefile	2026-02-13 09:34:22 +01:00
Agnes Leroy	3169ee8093	chore(gpu): bench classic or multi-bit params for compress/decompress	2026-02-12 11:45:44 +01:00
David Testé	8119c2287b	WIP: run classic an multi_bit benchs on gpu	2026-02-12 11:42:48 +01:00
David Testé	e4c7f83e17	WIP: use placeholder workflow to run benches	2026-02-10 16:09:26 +01:00
David Testé	349846bc11	WIP: add profile selection for GPU bench	2026-02-10 16:09:25 +01:00
David Testé	b1fb4b2ae0	WIP: adding missing DEX bench	2026-02-10 16:09:25 +01:00
David Testé	da3c55c50b	WIP: implement bench workflow testing cpu and gpu	2026-02-10 16:09:22 +01:00