feat(hpu): move to new bitstream at 400MHz with GRAM_NB 3

- update SIMD_N and min_batch_size to 12, which seems to give better
  latency and ERC20 throughput
- support IOp ack counts reported on several lines in the ami /proc file
- reduce the number of ERC_20_SIMD per batch in the HLAPI bench
This commit is contained in:
pgardratzama
2025-09-29 13:55:10 +02:00
committed by Pierre Gardrat
parent da223b36b6
commit 39b81a8ded
9 changed files with 15 additions and 15 deletions

View File

@@ -16,7 +16,7 @@ permissions: {}
jobs:
hlapi-benchmarks-hpu:
name: Execute HLAPI benchmarks for HPU backend
runs-on: v80-desktop
runs-on: v80-marais
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -63,8 +63,8 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
export V80_SERIAL_NUMBER=XFL12NWY3ZKG
source /opt/amd/Vivado/2024.2/settings64.sh
make bench_hlapi_erc20_hpu
make bench_hlapi_hpu

View File

@@ -29,7 +29,7 @@ permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: v80-desktop
runs-on: v80-marais
outputs:
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
steps:
@@ -58,7 +58,7 @@ jobs:
integer-benchmarks-hpu:
name: benchmark_hpu_integer/integer-benchmarks-hpu
needs: prepare-matrix
runs-on: v80-desktop
runs-on: v80-marais
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -114,8 +114,8 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
export V80_SERIAL_NUMBER=XFL12NWY3ZKG
source /opt/amd/Vivado/2024.2/settings64.sh
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
env:
BENCH_TYPE: ${{ matrix.bench_type }}

View File

@@ -1332,7 +1332,7 @@ bench_integer_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain

View File

@@ -78,7 +78,7 @@
#implementation = "Ilp"
implementation = "Llt"
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
min_batch_size = 9
min_batch_size = 12
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
size 84230351
oid sha256:3eb7619c9fb31dde691f08f963968ed60ec2892f3fab16b9c6c2353f9605efe8
size 82307506

View File

@@ -176,7 +176,7 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
imm: 1,
};
pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to have this dynamic
pub const SIMD_N: usize = 12; //TODO: We need to come up with a way to have this dynamic
pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
dst: [VarMode::Native; SIMD_N],
src: [VarMode::Native; 2 * SIMD_N],

View File

@@ -377,7 +377,7 @@ impl AmiDriver {
if ack_str.is_empty() {
0
} else {
let ack_nb = ack_str.as_str().trim_ascii().parse::<u32>().unwrap();
let ack_nb = ack_str.as_str().lines().map(|line| line.trim_ascii().parse::<u32>().unwrap()).sum();
tracing::trace!("Get value {ack_str} from {ami_proc_path} => {ack_nb}",);
ack_nb
}

View File

@@ -22,7 +22,7 @@ V80_PCIE_DEV="unselected"
XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
# V80 bitstream refresh require insmod of ami.ko module
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/ef9249f"}
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/e55d02d"}
# Parse user CLI ##############################################################
opt_short="hc:l:p:"

View File

@@ -663,7 +663,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
.len()
/ 3;
let mut rng = thread_rng();
for num_elems in [2, 10] {
for num_elems in [2, 8] {
let real_num_elems = num_elems * (hpu_simd_n as u64);
group.throughput(Throughput::Elements(real_num_elems));
let bench_id =