mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-05 04:44:41 -05:00
feat(hpu): move to new bitstream at 400 MHz with GRAM_NB 3
- update SIMD_N and min_batch_size to 12, which seems to give better latency and ERC20 throughput
- support IOp spread over several lines in the ami /proc file
- reduce the number of ERC_20_SIMD per batch in the HLAPI bench
This commit is contained in:
committed by
Pierre Gardrat
parent
da223b36b6
commit
39b81a8ded
6
.github/workflows/benchmark_hpu_hlapi.yml
vendored
6
.github/workflows/benchmark_hpu_hlapi.yml
vendored
@@ -16,7 +16,7 @@ permissions: {}
|
||||
jobs:
|
||||
hlapi-benchmarks-hpu:
|
||||
name: Execute HLAPI benchmarks for HPU backend
|
||||
runs-on: v80-desktop
|
||||
runs-on: v80-marais
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -63,8 +63,8 @@ jobs:
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make pull_hpu_files
|
||||
export V80_SERIAL_NUMBER=XFL12E4XJXWK
|
||||
source /opt/xilinx/Vivado/2024.2/settings64.sh
|
||||
export V80_SERIAL_NUMBER=XFL12NWY3ZKG
|
||||
source /opt/amd/Vivado/2024.2/settings64.sh
|
||||
make bench_hlapi_erc20_hpu
|
||||
make bench_hlapi_hpu
|
||||
|
||||
|
||||
8
.github/workflows/benchmark_hpu_integer.yml
vendored
8
.github/workflows/benchmark_hpu_integer.yml
vendored
@@ -29,7 +29,7 @@ permissions: {}
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
name: Prepare operations matrix
|
||||
runs-on: v80-desktop
|
||||
runs-on: v80-marais
|
||||
outputs:
|
||||
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
|
||||
steps:
|
||||
@@ -58,7 +58,7 @@ jobs:
|
||||
integer-benchmarks-hpu:
|
||||
name: benchmark_hpu_integer/integer-benchmarks-hpu
|
||||
needs: prepare-matrix
|
||||
runs-on: v80-desktop
|
||||
runs-on: v80-marais
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -114,8 +114,8 @@ jobs:
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make pull_hpu_files
|
||||
export V80_SERIAL_NUMBER=XFL12E4XJXWK
|
||||
source /opt/xilinx/Vivado/2024.2/settings64.sh
|
||||
export V80_SERIAL_NUMBER=XFL12NWY3ZKG
|
||||
source /opt/amd/Vivado/2024.2/settings64.sh
|
||||
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
|
||||
env:
|
||||
BENCH_TYPE: ${{ matrix.bench_type }}
|
||||
|
||||
2
Makefile
2
Makefile
@@ -1332,7 +1332,7 @@ bench_integer_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
|
||||
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
|
||||
|
||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||
bench_integer_compression: install_rs_check_toolchain
|
||||
|
||||
@@ -78,7 +78,7 @@
|
||||
#implementation = "Ilp"
|
||||
implementation = "Llt"
|
||||
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
|
||||
min_batch_size = 9
|
||||
min_batch_size = 12
|
||||
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
|
||||
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
|
||||
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
|
||||
size 84230351
|
||||
oid sha256:3eb7619c9fb31dde691f08f963968ed60ec2892f3fab16b9c6c2353f9605efe8
|
||||
size 82307506
|
||||
|
||||
@@ -176,7 +176,7 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
|
||||
imm: 1,
|
||||
};
|
||||
|
||||
pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to have this dynamic
|
||||
pub const SIMD_N: usize = 12; //TODO: We need to come up with a way to have this dynamic
|
||||
pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
|
||||
dst: [VarMode::Native; SIMD_N],
|
||||
src: [VarMode::Native; 2 * SIMD_N],
|
||||
|
||||
@@ -377,7 +377,7 @@ impl AmiDriver {
|
||||
if ack_str.is_empty() {
|
||||
0
|
||||
} else {
|
||||
let ack_nb = ack_str.as_str().trim_ascii().parse::<u32>().unwrap();
|
||||
let ack_nb = ack_str.as_str().lines().map(|line| line.trim_ascii().parse::<u32>().unwrap()).sum();
|
||||
tracing::trace!("Get value {ack_str} from {ami_proc_path} => {ack_nb}",);
|
||||
ack_nb
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ V80_PCIE_DEV="unselected"
|
||||
XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
|
||||
|
||||
# V80 bitstream refresh requires insmod of the ami.ko module
|
||||
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/ef9249f"}
|
||||
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/e55d02d"}
|
||||
|
||||
# Parse user CLI ##############################################################
|
||||
opt_short="hc:l:p:"
|
||||
|
||||
@@ -663,7 +663,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
|
||||
.len()
|
||||
/ 3;
|
||||
let mut rng = thread_rng();
|
||||
for num_elems in [2, 10] {
|
||||
for num_elems in [2, 8] {
|
||||
let real_num_elems = num_elems * (hpu_simd_n as u64);
|
||||
group.throughput(Throughput::Elements(real_num_elems));
|
||||
let bench_id =
|
||||
|
||||
Reference in New Issue
Block a user