feat(hpu): move to new bitstream at 400MHz with GRAM_NB 3

- update SIMD_N and min_batch_size to 12, which seems to give better latency and ERC20 throughput
- support IOp ack values reported on several lines in the ami /proc file
- reduce the number of ERC_20_SIMD operations per batch in the HLAPI bench
2026-01-07 22:04:10 -05:00 · 2025-09-29 13:55:10 +02:00
parent da223b36b6
commit 39b81a8ded
9 changed files with 15 additions and 15 deletions
--- a/.github/workflows/benchmark_hpu_hlapi.yml
+++ b/.github/workflows/benchmark_hpu_hlapi.yml
@@ -16,7 +16,7 @@ permissions: {}
 jobs:
  hlapi-benchmarks-hpu:
    name: Execute HLAPI benchmarks for HPU backend
-    runs-on: v80-desktop
+    runs-on: v80-marais
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -63,8 +63,8 @@ jobs:
      - name: Run benchmarks
        run: |
          make pull_hpu_files
-          export V80_SERIAL_NUMBER=XFL12E4XJXWK
+          export V80_SERIAL_NUMBER=XFL12NWY3ZKG
-          source /opt/xilinx/Vivado/2024.2/settings64.sh
+          source /opt/amd/Vivado/2024.2/settings64.sh
          make bench_hlapi_erc20_hpu
          make bench_hlapi_hpu
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -29,7 +29,7 @@ permissions: {}
 jobs:
  prepare-matrix:
    name: Prepare operations matrix
-    runs-on: v80-desktop
+    runs-on: v80-marais
    outputs:
      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
    steps:
@@ -58,7 +58,7 @@ jobs:
  integer-benchmarks-hpu:
    name: benchmark_hpu_integer/integer-benchmarks-hpu
    needs: prepare-matrix
-    runs-on: v80-desktop
+    runs-on: v80-marais
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -114,8 +114,8 @@ jobs:
      - name: Run benchmarks
        run: |
          make pull_hpu_files
-          export V80_SERIAL_NUMBER=XFL12E4XJXWK
+          export V80_SERIAL_NUMBER=XFL12NWY3ZKG
-          source /opt/xilinx/Vivado/2024.2/settings64.sh
+          source /opt/amd/Vivado/2024.2/settings64.sh
          make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
        env:
          BENCH_TYPE: ${{ matrix.bench_type }}
--- a/2
+++ b/2
@@ -1332,7 +1332,7 @@ bench_integer_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
+	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
@@ -78,7 +78,7 @@
  #implementation = "Ilp"
  implementation = "Llt"
  integer_w=[2,4,6,8,10,12,14,16,32,64,128]
-  min_batch_size = 9
+  min_batch_size = 12
  kogge_cfg            = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
  custom_iop.'IOP[0]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
  custom_iop.'IOP[1]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
--- a/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
+++ b/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
+oid sha256:3eb7619c9fb31dde691f08f963968ed60ec2892f3fab16b9c6c2353f9605efe8
-size 84230351
+size 82307506
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -176,7 +176,7 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
    imm: 1,
 };
-pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to have this dynamic
+pub const SIMD_N: usize = 12; //TODO: We need to come up with a way to have this dynamic
 pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
    dst: [VarMode::Native; SIMD_N],
    src: [VarMode::Native; 2 * SIMD_N],
--- a/backends/tfhe-hpu-backend/src/ffi/v80/ami.rs
+++ b/backends/tfhe-hpu-backend/src/ffi/v80/ami.rs
@@ -377,7 +377,7 @@ impl AmiDriver {
        if ack_str.is_empty() {
            0
        } else {
-            let ack_nb = ack_str.as_str().trim_ascii().parse::<u32>().unwrap();
+            let ack_nb = ack_str.as_str().lines().map(|line| line.trim_ascii().parse::<u32>().unwrap()).sum();
            tracing::trace!("Get value {ack_str} from {ami_proc_path} => {ack_nb}",);
            ack_nb
        }
--- a/setup_hpu.sh
+++ b/setup_hpu.sh
@@ -22,7 +22,7 @@ V80_PCIE_DEV="unselected"
 XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
 # V80 bitstream refresh require insmod of ami.ko module
-AMI_PATH=${AMI_PATH:-"/opt/v80/ami/ef9249f"}
+AMI_PATH=${AMI_PATH:-"/opt/v80/ami/e55d02d"}
 # Parse user CLI ##############################################################
 opt_short="hc:l:p:"
--- a/tfhe-benchmark/benches/high_level_api/erc20.rs
+++ b/tfhe-benchmark/benches/high_level_api/erc20.rs
@@ -663,7 +663,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
        .len()
        / 3;
    let mut rng = thread_rng();
-    for num_elems in [2, 10] {
+    for num_elems in [2, 8] {
        let real_num_elems = num_elems * (hpu_simd_n as u64);
        group.throughput(Throughput::Elements(real_num_elems));
        let bench_id =