From 617b45748f75f402810099ecb623610cff6f2ca9 Mon Sep 17 00:00:00 2001 From: chenyu Date: Fri, 18 Apr 2025 07:20:25 -0400 Subject: [PATCH] fuse embedding for bert on red (#9925) also updated BEAM param and use AMD driver for actual run. 535ms step --- .../bert/implementations/tinybox_red/dev_beam.sh | 4 +++- .../bert/implementations/tinybox_red/dev_run.sh | 4 +++- .../bert/implementations/tinybox_red/run_and_time.sh | 10 ++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 17766f40bb..a31d1d7e3c 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -4,7 +4,9 @@ export PYTHONPATH="." export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 -export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=5 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 export BEAM_LOG_SURPASS_MAX=1 export BASEDIR="/raid/datasets/wiki" diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 68c1bc459d..5029bdf4c8 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -4,7 +4,9 @@ export PYTHONPATH="." export MODEL="bert" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 -export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=5 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 export BASEDIR="/raid/datasets/wiki" diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index 899badcc14..3c5ef139ac 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -5,7 +5,9 @@ export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_red" export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96 -export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 +export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 + +export BEAM=5 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 export IGNORE_JIT_FIRST_BEAM=1 export BASEDIR="/raid/datasets/wiki" @@ -16,9 +18,13 @@ export SEED=$RANDOM DATETIME=$(date "+%m%d%H%M") LOGFILE="bert_red_${DATETIME}_${SEED}.log" +export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang? + # init -sudo rmmod amdgpu || true +sleep 5 && sudo rmmod amdgpu || true BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE # run +# TODO: AM driver resulted in nan +sudo modprobe amdgpu PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE