mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-07 22:04:10 -05:00
feat(hpu): move to new bitstream at 400 MHz with GRAM_NB 3
- update SIMD_N and min_batch_size to 12, which seems to give better latency and ERC20 throughput
- support IOp on several lines in ami /proc file
- reduce amount of ERC_20_SIMD per batch in HLAPI bench
This commit is contained in:
committed by
Pierre Gardrat
parent
da223b36b6
commit
39b81a8ded
6
.github/workflows/benchmark_hpu_hlapi.yml
vendored
6
.github/workflows/benchmark_hpu_hlapi.yml
vendored
@@ -16,7 +16,7 @@ permissions: {}
|
|||||||
jobs:
|
jobs:
|
||||||
hlapi-benchmarks-hpu:
|
hlapi-benchmarks-hpu:
|
||||||
name: Execute HLAPI benchmarks for HPU backend
|
name: Execute HLAPI benchmarks for HPU backend
|
||||||
runs-on: v80-desktop
|
runs-on: v80-marais
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}_${{ github.ref }}
|
group: ${{ github.workflow }}_${{ github.ref }}
|
||||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||||
@@ -63,8 +63,8 @@ jobs:
|
|||||||
- name: Run benchmarks
|
- name: Run benchmarks
|
||||||
run: |
|
run: |
|
||||||
make pull_hpu_files
|
make pull_hpu_files
|
||||||
export V80_SERIAL_NUMBER=XFL12E4XJXWK
|
export V80_SERIAL_NUMBER=XFL12NWY3ZKG
|
||||||
source /opt/xilinx/Vivado/2024.2/settings64.sh
|
source /opt/amd/Vivado/2024.2/settings64.sh
|
||||||
make bench_hlapi_erc20_hpu
|
make bench_hlapi_erc20_hpu
|
||||||
make bench_hlapi_hpu
|
make bench_hlapi_hpu
|
||||||
|
|
||||||
|
|||||||
8
.github/workflows/benchmark_hpu_integer.yml
vendored
8
.github/workflows/benchmark_hpu_integer.yml
vendored
@@ -29,7 +29,7 @@ permissions: {}
|
|||||||
jobs:
|
jobs:
|
||||||
prepare-matrix:
|
prepare-matrix:
|
||||||
name: Prepare operations matrix
|
name: Prepare operations matrix
|
||||||
runs-on: v80-desktop
|
runs-on: v80-marais
|
||||||
outputs:
|
outputs:
|
||||||
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
|
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
|
||||||
steps:
|
steps:
|
||||||
@@ -58,7 +58,7 @@ jobs:
|
|||||||
integer-benchmarks-hpu:
|
integer-benchmarks-hpu:
|
||||||
name: benchmark_hpu_integer/integer-benchmarks-hpu
|
name: benchmark_hpu_integer/integer-benchmarks-hpu
|
||||||
needs: prepare-matrix
|
needs: prepare-matrix
|
||||||
runs-on: v80-desktop
|
runs-on: v80-marais
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}_${{ github.ref }}
|
group: ${{ github.workflow }}_${{ github.ref }}
|
||||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||||
@@ -114,8 +114,8 @@ jobs:
|
|||||||
- name: Run benchmarks
|
- name: Run benchmarks
|
||||||
run: |
|
run: |
|
||||||
make pull_hpu_files
|
make pull_hpu_files
|
||||||
export V80_SERIAL_NUMBER=XFL12E4XJXWK
|
export V80_SERIAL_NUMBER=XFL12NWY3ZKG
|
||||||
source /opt/xilinx/Vivado/2024.2/settings64.sh
|
source /opt/amd/Vivado/2024.2/settings64.sh
|
||||||
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
|
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
|
||||||
env:
|
env:
|
||||||
BENCH_TYPE: ${{ matrix.bench_type }}
|
BENCH_TYPE: ${{ matrix.bench_type }}
|
||||||
|
|||||||
2
Makefile
2
Makefile
@@ -1332,7 +1332,7 @@ bench_integer_hpu: install_rs_check_toolchain
|
|||||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||||
--bench integer-bench \
|
--bench integer-bench \
|
||||||
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
|
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
|
||||||
|
|
||||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||||
bench_integer_compression: install_rs_check_toolchain
|
bench_integer_compression: install_rs_check_toolchain
|
||||||
|
|||||||
@@ -78,7 +78,7 @@
|
|||||||
#implementation = "Ilp"
|
#implementation = "Ilp"
|
||||||
implementation = "Llt"
|
implementation = "Llt"
|
||||||
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
|
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
|
||||||
min_batch_size = 9
|
min_batch_size = 12
|
||||||
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
|
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
|
||||||
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
|
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
|
||||||
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
|
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
version https://git-lfs.github.com/spec/v1
|
||||||
oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
|
oid sha256:3eb7619c9fb31dde691f08f963968ed60ec2892f3fab16b9c6c2353f9605efe8
|
||||||
size 84230351
|
size 82307506
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
|
|||||||
imm: 1,
|
imm: 1,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to have this dynamic
|
pub const SIMD_N: usize = 12; //TODO: We need to come up with a way to have this dynamic
|
||||||
pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
|
pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
|
||||||
dst: [VarMode::Native; SIMD_N],
|
dst: [VarMode::Native; SIMD_N],
|
||||||
src: [VarMode::Native; 2 * SIMD_N],
|
src: [VarMode::Native; 2 * SIMD_N],
|
||||||
|
|||||||
@@ -377,7 +377,7 @@ impl AmiDriver {
|
|||||||
if ack_str.is_empty() {
|
if ack_str.is_empty() {
|
||||||
0
|
0
|
||||||
} else {
|
} else {
|
||||||
let ack_nb = ack_str.as_str().trim_ascii().parse::<u32>().unwrap();
|
let ack_nb = ack_str.as_str().lines().map(|line| line.trim_ascii().parse::<u32>().unwrap()).sum();
|
||||||
tracing::trace!("Get value {ack_str} from {ami_proc_path} => {ack_nb}",);
|
tracing::trace!("Get value {ack_str} from {ami_proc_path} => {ack_nb}",);
|
||||||
ack_nb
|
ack_nb
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ V80_PCIE_DEV="unselected"
|
|||||||
XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
|
XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
|
||||||
|
|
||||||
# V80 bitstream refresh require insmod of ami.ko module
|
# V80 bitstream refresh require insmod of ami.ko module
|
||||||
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/ef9249f"}
|
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/e55d02d"}
|
||||||
|
|
||||||
# Parse user CLI ##############################################################
|
# Parse user CLI ##############################################################
|
||||||
opt_short="hc:l:p:"
|
opt_short="hc:l:p:"
|
||||||
|
|||||||
@@ -663,7 +663,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
|
|||||||
.len()
|
.len()
|
||||||
/ 3;
|
/ 3;
|
||||||
let mut rng = thread_rng();
|
let mut rng = thread_rng();
|
||||||
for num_elems in [2, 10] {
|
for num_elems in [2, 8] {
|
||||||
let real_num_elems = num_elems * (hpu_simd_n as u64);
|
let real_num_elems = num_elems * (hpu_simd_n as u64);
|
||||||
group.throughput(Throughput::Elements(real_num_elems));
|
group.throughput(Throughput::Elements(real_num_elems));
|
||||||
let bench_id =
|
let bench_id =
|
||||||
|
|||||||
Reference in New Issue
Block a user