Adjust BENCHMARK handling in train_bert and add a tinybox_amd dev_beam.sh script.

examples/mlperf/model_train.py
  * The condition guarding the INIT_STOP logging branch changes from
    `j == BENCHMARK` to `(j+1) == min(BENCHMARK, max_eval_steps)`, so it is
    compared against a 1-based step count that is capped at max_eval_steps.
    NOTE(review): the loop that defines `j` is outside these hunks; this
    presumably is a 0-based eval-step index - confirm against the full file.
  * A new check asserts `i < BENCHMARK` whenever BENCHMARK is set; the in-code
    comment describes reaching it as a BENCHMARK run that was not properly
    terminated.

examples/mlperf/training_submission_v5.0/.../tinybox_amd/dev_beam.sh (new, mode 100755)
  * Exports the run configuration (MODEL=bert, HALF dtypes, GPUS=8, BS=1024,
    EVAL_BS=1024, BEAM settings, RESET_STEP=1, BENCHMARK=10, DEBUG=2) and then
    runs examples/mlperf/model_train.py with python3.

NOTE(review): this patch had been flattened onto a single line. Line breaks
were restored from the hunk headers; leading whitespace inside the Python
hunks was reconstructed assuming 2-space indentation and should be verified
against the target file before applying.

diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py
index 56a3f728da..4a8192c80f 100644
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@@ -853,7 +853,7 @@ def train_bert():
         et = time.time()
         eval_times.append(et - st)
 
-        if BENCHMARK and j == BENCHMARK:
+        if BENCHMARK and (j+1) == min(BENCHMARK, max_eval_steps):
           # assume INITMLPERF has BENCHMARK set
           if MLLOGGER and INITMLPERF:
             MLLOGGER.event(key=mllog_constants.INIT_STOP, value=None)
@@ -900,6 +900,9 @@ def train_bert():
           # stop once hitting the target
           break
 
+    # should not happen, BENCHMARK not properly terminated
+    if BENCHMARK: assert i < BENCHMARK, i
+
     if getenv("CKPT") and i % save_ckpt_freq == 0:
       if MLLOGGER and RUNMLPERF:
         if previous_step:
diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh
new file mode 100755
index 0000000000..1b6ff92c14
--- /dev/null
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_amd/dev_beam.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+export PYTHONPATH="."
+export MODEL="bert"
+export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=8 BS=1024 EVAL_BS=1024
+
+export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
+export IGNORE_JIT_FIRST_BEAM=1
+# export BEAM_LOG_SURPASS_MAX=1
+# export BASEDIR="/raid/datasets/wiki"
+
+export RESET_STEP=1
+export BENCHMARK=10 DEBUG=2
+
+python3 examples/mlperf/model_train.py