bert dev_beam for mi300x box (#9648)

* bert dev_beam for mi300x box

* terminate BENCHMARK properly
This commit is contained in:
chenyu
2025-03-31 08:35:51 -04:00
committed by GitHub
parent 5171b098e5
commit f7cb2e8da3
2 changed files with 19 additions and 1 deletions

View File

@@ -853,7 +853,7 @@ def train_bert():
et = time.time()
eval_times.append(et - st)
if BENCHMARK and j == BENCHMARK:
if BENCHMARK and (j+1) == min(BENCHMARK, max_eval_steps):
# assume INITMLPERF has BENCHMARK set
if MLLOGGER and INITMLPERF:
MLLOGGER.event(key=mllog_constants.INIT_STOP, value=None)
@@ -900,6 +900,9 @@ def train_bert():
# stop once hitting the target
break
# should not happen, BENCHMARK not properly terminated
if BENCHMARK: assert i < BENCHMARK, i
if getenv("CKPT") and i % save_ckpt_freq == 0:
if MLLOGGER and RUNMLPERF:
if previous_step:

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Launches the BERT trainer in benchmark mode with BEAM enabled.
# NOTE(review): PYTHONPATH and the trainer path are both relative, so this is
# presumably meant to be run from the repository root -- confirm.

export PYTHONPATH="."
export MODEL="bert"

# Precision, device count, and train/eval batch sizes.
export DEFAULT_FLOAT="HALF"
export SUM_DTYPE="HALF"
export GPUS=8
export BS=1024
export EVAL_BS=1024

# BEAM configuration.
export BEAM=3
export BEAM_UOPS_MAX=4000
export BEAM_UPCAST_MAX=256
export BEAM_LOCAL_MAX=1024
export BEAM_MIN_PROGRESS=5
export IGNORE_JIT_FIRST_BEAM=1

# Optional overrides, left disabled; uncomment to enable.
# export BEAM_LOG_SURPASS_MAX=1
# export BASEDIR="/raid/datasets/wiki"

export RESET_STEP=1

# Benchmark setting (BENCHMARK=10) and debug level.
export BENCHMARK=10
export DEBUG=2

python3 examples/mlperf/model_train.py