fuse embedding for bert on red (#9925)

Also updated the BEAM parameters and switched to the AMD driver for the actual run. 535 ms per step.
This commit is contained in:
chenyu
2025-04-18 07:20:25 -04:00
committed by GitHub
parent b58decac0c
commit 617b45748f
3 changed files with 14 additions and 4 deletions

View File

@@ -4,7 +4,9 @@ export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BEAM_LOG_SURPASS_MAX=1
export BASEDIR="/raid/datasets/wiki"

View File

@@ -4,7 +4,9 @@ export PYTHONPATH="."
export MODEL="bert"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"

View File

@@ -5,7 +5,9 @@ export MODEL="bert"
export SUBMISSION_PLATFORM="tinybox_red"
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export FUSE_ARANGE=1 FUSE_ARANGE_UINT=0
export BEAM=5 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1
export BASEDIR="/raid/datasets/wiki"
@@ -16,9 +18,13 @@ export SEED=$RANDOM
DATETIME=$(date "+%m%d%H%M")
LOGFILE="bert_red_${DATETIME}_${SEED}.log"
export HCQDEV_WAIT_TIMEOUT_MS=100000 # prevents hang?
# init
sudo rmmod amdgpu || true
sleep 5 && sudo rmmod amdgpu || true
BENCHMARK=10 INITMLPERF=1 BERT_LAYERS=2 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
# TODO: AM driver resulted in nan
sudo modprobe amdgpu
PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE